xpk 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -12
- xpk/commands/cluster.py +33 -16
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +23 -20
- xpk/commands/run.py +17 -11
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +64 -19
- xpk/commands/workload.py +154 -319
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +322 -32
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +75 -5
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +11 -3
- xpk/core/gcsfuse.py +8 -5
- xpk/core/kjob.py +57 -18
- xpk/core/nap.py +4 -0
- xpk/core/network.py +11 -21
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +27 -82
- xpk/core/workload_decorators/rdma_decorator.py +3 -3
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
- xpk/parser/cluster.py +15 -6
- xpk/parser/storage.py +14 -3
- xpk/parser/workload.py +59 -31
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
|
@@ -22,22 +22,34 @@ from ruamel import yaml
|
|
|
22
22
|
|
|
23
23
|
from ...utils.console import xpk_exit, xpk_print
|
|
24
24
|
from ...utils.file import ensure_directory_exists
|
|
25
|
-
from ..capacity import
|
|
25
|
+
from ..capacity import (
|
|
26
|
+
B200_DEVICE_TYPE,
|
|
27
|
+
H100_MEGA_DEVICE_TYPE,
|
|
28
|
+
H200_DEVICE_TYPE,
|
|
29
|
+
CapacityType,
|
|
30
|
+
)
|
|
26
31
|
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
27
32
|
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
28
33
|
|
|
34
|
+
|
|
29
35
|
yaml = yaml.YAML()
|
|
30
36
|
|
|
31
37
|
a3mega_device_type = H100_MEGA_DEVICE_TYPE
|
|
32
38
|
a3ultra_device_type = H200_DEVICE_TYPE
|
|
33
|
-
|
|
39
|
+
a4_device_type = B200_DEVICE_TYPE
|
|
40
|
+
supported_device_types = {
|
|
41
|
+
a3mega_device_type,
|
|
42
|
+
a3ultra_device_type,
|
|
43
|
+
a4_device_type,
|
|
44
|
+
}
|
|
34
45
|
blueprint_dependencies_dir = {
|
|
35
46
|
a3mega_device_type: "src/xpk/blueprints/a3mega",
|
|
36
47
|
a3ultra_device_type: "src/xpk/blueprints/a3ultra",
|
|
48
|
+
a4_device_type: "src/xpk/blueprints/a4",
|
|
37
49
|
}
|
|
38
50
|
|
|
39
51
|
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
40
|
-
cluster_toolkit_version = "v1.
|
|
52
|
+
cluster_toolkit_version = "v1.48.0"
|
|
41
53
|
|
|
42
54
|
|
|
43
55
|
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
|
|
@@ -50,6 +62,10 @@ def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
|
|
|
50
62
|
]
|
|
51
63
|
|
|
52
64
|
|
|
65
|
+
def get_subnetworks_for_a4() -> list[str]:
|
|
66
|
+
return ["gvnic-1"] + [f"rdma-{i}" for i in range(8)]
|
|
67
|
+
|
|
68
|
+
|
|
53
69
|
class BlueprintGeneratorOutput:
|
|
54
70
|
"""BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
|
|
55
71
|
Atributes:
|
|
@@ -157,6 +173,11 @@ class BlueprintGenerator:
|
|
|
157
173
|
"total_min_nodes": system_node_pool_min_node_count,
|
|
158
174
|
"total_max_nodes": 1000,
|
|
159
175
|
},
|
|
176
|
+
"k8s_network_names": {
|
|
177
|
+
"gvnic_prefix": f"{cluster_name}-gpunet-",
|
|
178
|
+
"gvnic_postfix": "-subnet",
|
|
179
|
+
"gvnic_start_index": 0,
|
|
180
|
+
},
|
|
160
181
|
},
|
|
161
182
|
outputs=["instructions"],
|
|
162
183
|
)
|
|
@@ -200,7 +221,7 @@ class BlueprintGenerator:
|
|
|
200
221
|
"install": True,
|
|
201
222
|
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
202
223
|
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
203
|
-
"config_template_vars": {"num_chips":
|
|
224
|
+
"config_template_vars": {"num_chips": num_chips},
|
|
204
225
|
},
|
|
205
226
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
206
227
|
"apply_manifests": [{
|
|
@@ -244,7 +265,7 @@ class BlueprintGenerator:
|
|
|
244
265
|
)
|
|
245
266
|
a3_mega_blueprint = Blueprint(
|
|
246
267
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
247
|
-
gcs_bucket, prefix
|
|
268
|
+
gcs_bucket, cluster_name, prefix
|
|
248
269
|
),
|
|
249
270
|
blueprint_name=blueprint_name,
|
|
250
271
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -261,8 +282,8 @@ class BlueprintGenerator:
|
|
|
261
282
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
262
283
|
blueprint_name, a3_mega_blueprint, prefix
|
|
263
284
|
)
|
|
264
|
-
blueprint_dependencies = self.
|
|
265
|
-
blueprint_name, prefix
|
|
285
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
286
|
+
a3mega_device_type, blueprint_name, prefix
|
|
266
287
|
)
|
|
267
288
|
xpk_print(f"Blueprint file path: {blueprint_file_path}")
|
|
268
289
|
xpk_print(
|
|
@@ -331,7 +352,7 @@ class BlueprintGenerator:
|
|
|
331
352
|
)
|
|
332
353
|
ml_gke = Blueprint(
|
|
333
354
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
334
|
-
gcs_bucket, prefix
|
|
355
|
+
gcs_bucket, cluster_name, prefix
|
|
335
356
|
),
|
|
336
357
|
blueprint_name=blueprint_name,
|
|
337
358
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -490,6 +511,13 @@ class BlueprintGenerator:
|
|
|
490
511
|
" alias_ip_range=[]}],"
|
|
491
512
|
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
492
513
|
),
|
|
514
|
+
"k8s_network_names": {
|
|
515
|
+
"rdma_prefix": f"{cluster_name}-rdma-sub-",
|
|
516
|
+
"rdma_start_index": 0,
|
|
517
|
+
"rdma_postfix": "",
|
|
518
|
+
"gvnic_prefix": f"{cluster_name}-sub-",
|
|
519
|
+
"gvnic_start_index": 1,
|
|
520
|
+
},
|
|
493
521
|
},
|
|
494
522
|
outputs=["instructions"],
|
|
495
523
|
)
|
|
@@ -546,7 +574,7 @@ class BlueprintGenerator:
|
|
|
546
574
|
"install": True,
|
|
547
575
|
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
548
576
|
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
549
|
-
"config_template_vars": {"num_chips":
|
|
577
|
+
"config_template_vars": {"num_chips": num_chips},
|
|
550
578
|
},
|
|
551
579
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
552
580
|
"apply_manifests": [
|
|
@@ -597,7 +625,7 @@ class BlueprintGenerator:
|
|
|
597
625
|
)
|
|
598
626
|
a3_ultra_blueprint = Blueprint(
|
|
599
627
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
600
|
-
gcs_bucket, prefix
|
|
628
|
+
gcs_bucket, cluster_name, prefix
|
|
601
629
|
),
|
|
602
630
|
blueprint_name=blueprint_name,
|
|
603
631
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -614,8 +642,276 @@ class BlueprintGenerator:
|
|
|
614
642
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
615
643
|
blueprint_name, a3_ultra_blueprint, prefix
|
|
616
644
|
)
|
|
617
|
-
blueprint_dependencies = self.
|
|
618
|
-
blueprint_name, prefix
|
|
645
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
646
|
+
a3ultra_device_type, blueprint_name, prefix
|
|
647
|
+
)
|
|
648
|
+
return BlueprintGeneratorOutput(
|
|
649
|
+
blueprint_file=blueprint_file_path,
|
|
650
|
+
blueprint_dependencies=blueprint_dependencies,
|
|
651
|
+
)
|
|
652
|
+
|
|
653
|
+
def generate_a4_blueprint(
|
|
654
|
+
self,
|
|
655
|
+
project_id: str,
|
|
656
|
+
cluster_name: str,
|
|
657
|
+
blueprint_name: str,
|
|
658
|
+
region: str,
|
|
659
|
+
zone: str,
|
|
660
|
+
auth_cidr: str,
|
|
661
|
+
system_node_pool_machine_type: str,
|
|
662
|
+
reservation: Optional[str | None] = None,
|
|
663
|
+
gcs_bucket: Optional[str | None] = None,
|
|
664
|
+
num_nodes: int = 2,
|
|
665
|
+
prefix: str = "",
|
|
666
|
+
system_node_pool_min_node_count: int = 2,
|
|
667
|
+
capacity_type: CapacityType = CapacityType.ON_DEMAND,
|
|
668
|
+
) -> BlueprintGeneratorOutput:
|
|
669
|
+
"""Create A4 blueprint.
|
|
670
|
+
|
|
671
|
+
Args:
|
|
672
|
+
Returns:
|
|
673
|
+
- Blueprint representing cluster toolkit blueprint
|
|
674
|
+
"""
|
|
675
|
+
nccl_installer_path = (
|
|
676
|
+
f'$(ghpc_stage("{blueprint_name}"))/nccl-rdma-installer-a4.yaml'
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
net_0_id = f"{cluster_name}-net-0"
|
|
680
|
+
gpu_net_0 = DeploymentModule(
|
|
681
|
+
id=net_0_id,
|
|
682
|
+
source="modules/network/vpc",
|
|
683
|
+
settings={
|
|
684
|
+
"network_name": f"{cluster_name}-net-0",
|
|
685
|
+
"mtu": 8896,
|
|
686
|
+
"subnetworks": [{
|
|
687
|
+
"subnet_name": f"{cluster_name}-sub-0",
|
|
688
|
+
"subnet_region": region,
|
|
689
|
+
"subnet_ip": "192.168.0.0/18",
|
|
690
|
+
}],
|
|
691
|
+
"secondary_ranges_list": [{
|
|
692
|
+
"subnetwork_name": f"{cluster_name}-sub-0",
|
|
693
|
+
"ranges": [
|
|
694
|
+
{"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
|
|
695
|
+
{"range_name": "services", "ip_cidr_range": "10.0.32.0/20"},
|
|
696
|
+
],
|
|
697
|
+
}],
|
|
698
|
+
"firewall_rules": [{
|
|
699
|
+
"name": f"{cluster_name}-internal-0",
|
|
700
|
+
"ranges": ["192.168.0.0/16"],
|
|
701
|
+
"allow": [
|
|
702
|
+
{"protocol": "tcp", "ports": ["0-65535"]},
|
|
703
|
+
{"protocol": "udp", "ports": ["0-65535"]},
|
|
704
|
+
{"protocol": "icmp"},
|
|
705
|
+
],
|
|
706
|
+
}],
|
|
707
|
+
},
|
|
708
|
+
)
|
|
709
|
+
net_1_id = f"{cluster_name}-net-1"
|
|
710
|
+
gpu_net_1 = DeploymentModule(
|
|
711
|
+
id=net_1_id,
|
|
712
|
+
source="modules/network/vpc",
|
|
713
|
+
settings={
|
|
714
|
+
"network_name": f"{cluster_name}-net-1",
|
|
715
|
+
"mtu": 8896,
|
|
716
|
+
"subnetworks": [{
|
|
717
|
+
"subnet_name": f"{cluster_name}-sub-1",
|
|
718
|
+
"subnet_region": region,
|
|
719
|
+
"subnet_ip": "192.168.64.0/18",
|
|
720
|
+
}],
|
|
721
|
+
"firewall_rules": [{
|
|
722
|
+
"name": f"{cluster_name}-internal-1",
|
|
723
|
+
"ranges": ["192.168.0.0/16"],
|
|
724
|
+
"allow": [
|
|
725
|
+
{"protocol": "tcp", "ports": ["0-65535"]},
|
|
726
|
+
{"protocol": "udp", "ports": ["0-65535"]},
|
|
727
|
+
{"protocol": "icmp"},
|
|
728
|
+
],
|
|
729
|
+
}],
|
|
730
|
+
},
|
|
731
|
+
)
|
|
732
|
+
rma_net_id = f"{cluster_name}-rdma-net"
|
|
733
|
+
rma_net = DeploymentModule(
|
|
734
|
+
id=rma_net_id,
|
|
735
|
+
source="modules/network/gpu-rdma-vpc",
|
|
736
|
+
settings={
|
|
737
|
+
"network_name": f"{cluster_name}-rdma-net",
|
|
738
|
+
"mtu": 8896,
|
|
739
|
+
"network_profile": f"https://www.googleapis.com/compute/beta/projects/{project_id}/global/networkProfiles/{zone}-vpc-roce",
|
|
740
|
+
"network_routing_mode": "REGIONAL",
|
|
741
|
+
"subnetworks_template": {
|
|
742
|
+
"name_prefix": f"{cluster_name}-rdma-sub",
|
|
743
|
+
"count": 8,
|
|
744
|
+
"ip_range": "192.168.128.0/18",
|
|
745
|
+
"region": region,
|
|
746
|
+
},
|
|
747
|
+
},
|
|
748
|
+
)
|
|
749
|
+
cluster_id = f"{cluster_name}-a4-cluster"
|
|
750
|
+
a4_cluster = DeploymentModule(
|
|
751
|
+
id=cluster_id,
|
|
752
|
+
source="modules/scheduler/gke-cluster",
|
|
753
|
+
use=[net_0_id],
|
|
754
|
+
settings={
|
|
755
|
+
"system_node_pool_machine_type": system_node_pool_machine_type,
|
|
756
|
+
"system_node_pool_node_count": {
|
|
757
|
+
"total_min_nodes": system_node_pool_min_node_count,
|
|
758
|
+
"total_max_nodes": 1000,
|
|
759
|
+
},
|
|
760
|
+
"prefix_with_deployment_name": False,
|
|
761
|
+
"name_suffix": cluster_name,
|
|
762
|
+
"enable_dcgm_monitoring": True,
|
|
763
|
+
"enable_gcsfuse_csi": True,
|
|
764
|
+
"enable_private_endpoint": False,
|
|
765
|
+
"master_authorized_networks": [{
|
|
766
|
+
"cidr_block": auth_cidr,
|
|
767
|
+
"display_name": "kubectl-access-network",
|
|
768
|
+
}],
|
|
769
|
+
"additional_networks": (
|
|
770
|
+
f"$(concat([{{network={cluster_name}-net-1.network_name,"
|
|
771
|
+
f" subnetwork={cluster_name}-net-1.subnetwork_name,"
|
|
772
|
+
f' subnetwork_project="{project_id}", nic_type="GVNIC",'
|
|
773
|
+
" queue_count=null, network_ip=null, stack_type=null,"
|
|
774
|
+
" access_config=[{nat_ip=null, public_ptr_domain_name=null,"
|
|
775
|
+
" network_tier=null}], ipv6_access_config=[],"
|
|
776
|
+
" alias_ip_range=[]}],"
|
|
777
|
+
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
778
|
+
),
|
|
779
|
+
"version_prefix": "1.32.",
|
|
780
|
+
"release_channel": "RAPID",
|
|
781
|
+
"maintenance_exclusions": [{
|
|
782
|
+
"name": "no-minor-or-node-upgrades-indefinite",
|
|
783
|
+
"start_time": "2024-12-01T00:00:00Z",
|
|
784
|
+
"end_time": "2025-12-22T00:00:00Z",
|
|
785
|
+
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
786
|
+
}],
|
|
787
|
+
},
|
|
788
|
+
outputs=["instructions"],
|
|
789
|
+
)
|
|
790
|
+
system, _ = get_system_characteristics_by_device_type(a4_device_type)
|
|
791
|
+
if system is None:
|
|
792
|
+
xpk_print(
|
|
793
|
+
"Error: Could not retrieve system characteristics for"
|
|
794
|
+
f" {a4_device_type} device_type."
|
|
795
|
+
)
|
|
796
|
+
xpk_exit(1)
|
|
797
|
+
gpu_pool = DeploymentModule(
|
|
798
|
+
id=f"{cluster_name}-a4-pool",
|
|
799
|
+
source="modules/compute/gke-node-pool",
|
|
800
|
+
use=[cluster_id],
|
|
801
|
+
settings={
|
|
802
|
+
"machine_type": system.gce_machine_type,
|
|
803
|
+
"auto_upgrade": True,
|
|
804
|
+
"zones": [zone],
|
|
805
|
+
"disk_type": "hyperdisk-balanced",
|
|
806
|
+
"static_node_count": num_nodes,
|
|
807
|
+
"local_ssd_count_ephemeral_storage": 32,
|
|
808
|
+
"spot": capacity_type == CapacityType.SPOT,
|
|
809
|
+
"reservation_affinity": self._getblock_reservation_affinity(
|
|
810
|
+
reservation
|
|
811
|
+
),
|
|
812
|
+
"max_pods_per_node": 32,
|
|
813
|
+
"guest_accelerator": [{
|
|
814
|
+
"type": system.gke_accelerator,
|
|
815
|
+
"count": 8,
|
|
816
|
+
"gpu_driver_installation_config": {
|
|
817
|
+
"gpu_driver_version": "LATEST"
|
|
818
|
+
},
|
|
819
|
+
}],
|
|
820
|
+
"additional_networks": (
|
|
821
|
+
f"$(concat([{{network={cluster_name}-net-1.network_name,"
|
|
822
|
+
f" subnetwork={cluster_name}-net-1.subnetwork_name,"
|
|
823
|
+
f' subnetwork_project="{project_id}", nic_type="GVNIC",'
|
|
824
|
+
" queue_count=null, network_ip=null, stack_type=null,"
|
|
825
|
+
" access_config=[{nat_ip=null, public_ptr_domain_name=null,"
|
|
826
|
+
" network_tier=null}], ipv6_access_config=[],"
|
|
827
|
+
" alias_ip_range=[]}],"
|
|
828
|
+
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
829
|
+
),
|
|
830
|
+
},
|
|
831
|
+
outputs=["instructions"],
|
|
832
|
+
)
|
|
833
|
+
|
|
834
|
+
num_chips = num_nodes * system.chips_per_vm
|
|
835
|
+
workload_manager_install_id = "workload-manager-install"
|
|
836
|
+
workload_manager_install = DeploymentModule(
|
|
837
|
+
id=workload_manager_install_id,
|
|
838
|
+
source="modules/management/kubectl-apply",
|
|
839
|
+
use=[cluster_id],
|
|
840
|
+
settings={
|
|
841
|
+
"kueue": {
|
|
842
|
+
"install": True,
|
|
843
|
+
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
844
|
+
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
845
|
+
"config_template_vars": {"num_chips": num_chips},
|
|
846
|
+
},
|
|
847
|
+
"jobset": {"install": True, "version": "v0.7.2"},
|
|
848
|
+
"apply_manifests": [
|
|
849
|
+
{"source": nccl_installer_path},
|
|
850
|
+
{
|
|
851
|
+
"source": (
|
|
852
|
+
f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
853
|
+
)
|
|
854
|
+
},
|
|
855
|
+
],
|
|
856
|
+
},
|
|
857
|
+
)
|
|
858
|
+
|
|
859
|
+
workload_configmap = DeploymentModule(
|
|
860
|
+
id="workload_configmap",
|
|
861
|
+
source="modules/management/kubectl-apply",
|
|
862
|
+
use=[cluster_id],
|
|
863
|
+
settings={
|
|
864
|
+
"apply_manifests": [{
|
|
865
|
+
"source": (
|
|
866
|
+
f'$(ghpc_stage("{blueprint_name}"))/config-map.yaml.tftpl'
|
|
867
|
+
),
|
|
868
|
+
"template_vars": {
|
|
869
|
+
"resource_config_name": (
|
|
870
|
+
f"{cluster_name}-resources-configmap"
|
|
871
|
+
),
|
|
872
|
+
"num_nodes": f"{num_nodes}",
|
|
873
|
+
"cluster_config_name": f"{cluster_name}-metadata-configmap",
|
|
874
|
+
"capacity_type": f"{capacity_type.value}",
|
|
875
|
+
"reservation": f"{reservation}",
|
|
876
|
+
},
|
|
877
|
+
}]
|
|
878
|
+
},
|
|
879
|
+
)
|
|
880
|
+
|
|
881
|
+
primary_group = DeploymentGroup(
|
|
882
|
+
group="primary",
|
|
883
|
+
modules=[
|
|
884
|
+
gpu_net_0,
|
|
885
|
+
gpu_net_1,
|
|
886
|
+
rma_net,
|
|
887
|
+
a4_cluster,
|
|
888
|
+
gpu_pool,
|
|
889
|
+
workload_manager_install,
|
|
890
|
+
workload_configmap,
|
|
891
|
+
],
|
|
892
|
+
)
|
|
893
|
+
|
|
894
|
+
a4_blueprint = Blueprint(
|
|
895
|
+
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
896
|
+
gcs_bucket, cluster_name, prefix
|
|
897
|
+
),
|
|
898
|
+
blueprint_name=blueprint_name,
|
|
899
|
+
toolkit_modules_url=cluster_toolkit_url,
|
|
900
|
+
toolkit_modules_version=cluster_toolkit_version,
|
|
901
|
+
deployment_groups=[primary_group],
|
|
902
|
+
vars={
|
|
903
|
+
"project_id": project_id,
|
|
904
|
+
"deployment_name": blueprint_name,
|
|
905
|
+
"region": region,
|
|
906
|
+
"zone": zone,
|
|
907
|
+
},
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
blueprint_file_path = self._save_blueprint_to_file(
|
|
911
|
+
blueprint_name, a4_blueprint, prefix
|
|
912
|
+
)
|
|
913
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
914
|
+
a4_device_type, blueprint_name, prefix
|
|
619
915
|
)
|
|
620
916
|
return BlueprintGeneratorOutput(
|
|
621
917
|
blueprint_file=blueprint_file_path,
|
|
@@ -638,7 +934,7 @@ class BlueprintGenerator:
|
|
|
638
934
|
)
|
|
639
935
|
|
|
640
936
|
def _getblock_terraform_backend(
|
|
641
|
-
self, gcs_bucket: str, prefix: str = ""
|
|
937
|
+
self, gcs_bucket: str, cluster_name: str, prefix: str = ""
|
|
642
938
|
) -> dict | None:
|
|
643
939
|
if gcs_bucket is None:
|
|
644
940
|
return None
|
|
@@ -646,12 +942,19 @@ class BlueprintGenerator:
|
|
|
646
942
|
"type": "gcs",
|
|
647
943
|
"configuration": {
|
|
648
944
|
"bucket": gcs_bucket,
|
|
649
|
-
"prefix": self._get_terraforrm_backend_full_prefix(
|
|
945
|
+
"prefix": self._get_terraforrm_backend_full_prefix(
|
|
946
|
+
cluster_name, prefix
|
|
947
|
+
),
|
|
650
948
|
},
|
|
651
949
|
}
|
|
652
950
|
|
|
653
|
-
def _get_terraforrm_backend_full_prefix(
|
|
654
|
-
|
|
951
|
+
def _get_terraforrm_backend_full_prefix(
|
|
952
|
+
self, cluster_name: str, prefix: str = ""
|
|
953
|
+
) -> str:
|
|
954
|
+
full_prefix = "xpk_terraform_state"
|
|
955
|
+
if prefix:
|
|
956
|
+
full_prefix += f"/{prefix}"
|
|
957
|
+
return f"{full_prefix}/{cluster_name}/"
|
|
655
958
|
|
|
656
959
|
def _save_blueprint_to_file(
|
|
657
960
|
self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
|
|
@@ -676,27 +979,14 @@ class BlueprintGenerator:
|
|
|
676
979
|
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
677
980
|
return os.path.exists(blueprint_path)
|
|
678
981
|
|
|
679
|
-
def
|
|
680
|
-
self, blueprint_name: str, prefix: str = ""
|
|
681
|
-
) -> str:
|
|
682
|
-
deployment_files_path = os.path.join(
|
|
683
|
-
self._get_storage_path(prefix), blueprint_name
|
|
684
|
-
)
|
|
685
|
-
shutil.copytree(
|
|
686
|
-
blueprint_dependencies_dir[a3mega_device_type],
|
|
687
|
-
deployment_files_path,
|
|
688
|
-
dirs_exist_ok=True,
|
|
689
|
-
)
|
|
690
|
-
return deployment_files_path
|
|
691
|
-
|
|
692
|
-
def _get_a3_ultra_blueprint_dependencies(
|
|
693
|
-
self, blueprint_name: str, prefix: str = ""
|
|
982
|
+
def _get_blueprint_dependencies(
|
|
983
|
+
self, device_type: str, blueprint_name: str, prefix: str = ""
|
|
694
984
|
) -> str:
|
|
695
985
|
deployment_files_path = os.path.join(
|
|
696
986
|
self._get_storage_path(prefix), blueprint_name
|
|
697
987
|
)
|
|
698
988
|
shutil.copytree(
|
|
699
|
-
blueprint_dependencies_dir[
|
|
989
|
+
blueprint_dependencies_dir[device_type],
|
|
700
990
|
deployment_files_path,
|
|
701
991
|
dirs_exist_ok=True,
|
|
702
992
|
)
|
xpk/core/capacity.py
CHANGED
xpk/core/cluster.py
CHANGED
|
@@ -32,7 +32,8 @@ from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to
|
|
|
32
32
|
from .nodepool import upgrade_gke_nodepools_version
|
|
33
33
|
from .system_characteristics import SystemCharacteristics
|
|
34
34
|
|
|
35
|
-
JOBSET_VERSION = 'v0.
|
|
35
|
+
JOBSET_VERSION = 'v0.8.0'
|
|
36
|
+
PATHWAYS_JOB_VERSION = 'v0.1.0'
|
|
36
37
|
INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
37
38
|
INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
38
39
|
|
|
@@ -71,6 +72,35 @@ def set_jobset_on_cluster(args) -> int:
|
|
|
71
72
|
return return_code
|
|
72
73
|
|
|
73
74
|
|
|
75
|
+
def set_pathways_job_on_cluster(args) -> int:
|
|
76
|
+
"""Add PathwaysJob command on server side and ask user to verify it is created.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
args: user provided arguments for running the command.
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
0 if successful and 1 otherwise.
|
|
83
|
+
"""
|
|
84
|
+
command = (
|
|
85
|
+
'kubectl apply --server-side -f'
|
|
86
|
+
f' https://github.com/google/pathways-job/releases/download/{PATHWAYS_JOB_VERSION}/install.yaml'
|
|
87
|
+
)
|
|
88
|
+
task = f'Install PathwaysJob on {args.cluster}'
|
|
89
|
+
return_code = run_command_with_updates_retry(command, task, args)
|
|
90
|
+
|
|
91
|
+
if return_code != 0:
|
|
92
|
+
xpk_print(f'{task} returned with ERROR {return_code}.\n')
|
|
93
|
+
xpk_print(
|
|
94
|
+
"This LIKELY means you're missing Kubernetes Permissions, you can"
|
|
95
|
+
' validate this by checking if the error references permission problems'
|
|
96
|
+
' such as `requires one of ["container.*"] permission(s)`. Follow our'
|
|
97
|
+
' readme:'
|
|
98
|
+
' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
|
|
99
|
+
' instructions on how to fix these permissions.'
|
|
100
|
+
)
|
|
101
|
+
return return_code
|
|
102
|
+
|
|
103
|
+
|
|
74
104
|
def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
75
105
|
"""Install NCCL plugin on the cluster.
|
|
76
106
|
|
|
@@ -135,8 +165,48 @@ def update_cluster_with_gcpfilestore_driver_if_necessary(args) -> int:
|
|
|
135
165
|
return 0
|
|
136
166
|
|
|
137
167
|
|
|
168
|
+
def update_cluster_with_parallelstore_driver_if_necessary(args) -> int:
|
|
169
|
+
"""Updates a GKE cluster to enable Parallelstore CSI driver, if not enabled already.
|
|
170
|
+
Args:
|
|
171
|
+
args: user provided arguments for running the command.
|
|
172
|
+
Returns:
|
|
173
|
+
0 if successful and error code otherwise.
|
|
174
|
+
"""
|
|
175
|
+
if is_driver_enabled_on_cluster(args, driver='parallelstoreCsiDriver'):
|
|
176
|
+
return 0
|
|
177
|
+
cluster_update_return_code = update_gke_cluster_with_addon(
|
|
178
|
+
args, 'ParallelstoreCsiDriver'
|
|
179
|
+
)
|
|
180
|
+
if cluster_update_return_code > 0:
|
|
181
|
+
xpk_print('Updating GKE cluster to enable Parallelstore CSI driver failed!')
|
|
182
|
+
return cluster_update_return_code
|
|
183
|
+
|
|
184
|
+
return 0
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def update_cluster_with_pd_driver_if_necessary(args) -> int:
|
|
188
|
+
"""Updates a GKE cluster to enable PersistentDisk CSI driver, if not enabled already.
|
|
189
|
+
Args:
|
|
190
|
+
args: user provided arguments for running the command.
|
|
191
|
+
Returns:
|
|
192
|
+
0 if successful and error code otherwise.
|
|
193
|
+
"""
|
|
194
|
+
if is_driver_enabled_on_cluster(args, driver='gcePersistentDiskCsiDriver'):
|
|
195
|
+
return 0
|
|
196
|
+
cluster_update_return_code = update_gke_cluster_with_addon(
|
|
197
|
+
args, 'GcePersistentDiskCsiDriver'
|
|
198
|
+
)
|
|
199
|
+
if cluster_update_return_code > 0:
|
|
200
|
+
xpk_print(
|
|
201
|
+
'Updating GKE cluster to enable PersistentDisk CSI driver failed!'
|
|
202
|
+
)
|
|
203
|
+
return cluster_update_return_code
|
|
204
|
+
|
|
205
|
+
return 0
|
|
206
|
+
|
|
207
|
+
|
|
138
208
|
def is_driver_enabled_on_cluster(args, driver: str) -> bool:
|
|
139
|
-
"""Checks if
|
|
209
|
+
"""Checks if the CSI driver is enabled on the cluster.
|
|
140
210
|
Args:
|
|
141
211
|
args: user provided arguments for running the command.
|
|
142
212
|
driver (str) : name of the driver
|
|
@@ -148,14 +218,14 @@ def is_driver_enabled_on_cluster(args, driver: str) -> bool:
|
|
|
148
218
|
f' --project={args.project} --region={zone_to_region(args.zone)}'
|
|
149
219
|
f' --format="value(addonsConfig.{driver}Config.enabled)"'
|
|
150
220
|
)
|
|
151
|
-
return_code,
|
|
221
|
+
return_code, driver_enabled = run_command_for_value(
|
|
152
222
|
command,
|
|
153
223
|
f'Checks if {driver} driver is enabled in cluster describe.',
|
|
154
224
|
args,
|
|
155
225
|
)
|
|
156
226
|
if return_code != 0:
|
|
157
227
|
xpk_exit(return_code)
|
|
158
|
-
if
|
|
228
|
+
if driver_enabled.strip().lower() == 'true':
|
|
159
229
|
xpk_print(f'{driver} driver is enabled on the cluster, no update needed.')
|
|
160
230
|
return True
|
|
161
231
|
return False
|
|
@@ -446,7 +516,7 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
|
|
|
446
516
|
)
|
|
447
517
|
if return_code != 0:
|
|
448
518
|
xpk_exit(return_code)
|
|
449
|
-
if gcsfuse_driver_enabled.lower() == 'true':
|
|
519
|
+
if gcsfuse_driver_enabled.strip().lower() == 'true':
|
|
450
520
|
xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.')
|
|
451
521
|
return True
|
|
452
522
|
return False
|
xpk/core/config.py
CHANGED
|
@@ -24,7 +24,7 @@ from ..utils.console import xpk_print
|
|
|
24
24
|
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
25
25
|
|
|
26
26
|
# This is the version for XPK PyPI package
|
|
27
|
-
__version__ = 'v0.
|
|
27
|
+
__version__ = 'v0.8.0'
|
|
28
28
|
XPK_CURRENT_VERSION = __version__
|
|
29
29
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
30
30
|
|
|
@@ -39,6 +39,7 @@ KJOB_SHELL_IMAGE = 'shell-image'
|
|
|
39
39
|
KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
|
|
40
40
|
KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
|
|
41
41
|
CONFIGS_KEY = 'configs'
|
|
42
|
+
GKE_ENDPOINT_KEY = 'gke-endpoint'
|
|
42
43
|
DEPENDENCIES_KEY = 'deps-verified-version'
|
|
43
44
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
44
45
|
|
|
@@ -47,6 +48,7 @@ DEFAULT_KEYS = [
|
|
|
47
48
|
CLUSTER_NAME_KEY,
|
|
48
49
|
PROJECT_KEY,
|
|
49
50
|
ZONE_KEY,
|
|
51
|
+
GKE_ENDPOINT_KEY,
|
|
50
52
|
DEPENDENCIES_KEY,
|
|
51
53
|
KJOB_BATCH_IMAGE,
|
|
52
54
|
KJOB_BATCH_WORKING_DIRECTORY,
|
xpk/core/docker_manager.py
CHANGED
|
@@ -30,7 +30,7 @@ import time
|
|
|
30
30
|
DockerRunCommandExitCode = 135
|
|
31
31
|
dockerBuildErrorCode = 134
|
|
32
32
|
ctk_dockerfile_path = "Dockerfile"
|
|
33
|
-
ctk_build_ref = "v1.
|
|
33
|
+
ctk_build_ref = "v1.48.0"
|
|
34
34
|
ctk_docker_image = "xpk-ctk"
|
|
35
35
|
ctk_container_name = "xpk-ctk-container"
|
|
36
36
|
gcloud_cfg_mount_path = "/root/.config/gcloud"
|