xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
|
@@ -22,32 +22,34 @@ from ruamel import yaml
|
|
|
22
22
|
|
|
23
23
|
from ...utils.console import xpk_exit, xpk_print
|
|
24
24
|
from ...utils.file import ensure_directory_exists
|
|
25
|
-
from ..capacity import
|
|
25
|
+
from ..capacity import (
|
|
26
|
+
B200_DEVICE_TYPE,
|
|
27
|
+
H100_MEGA_DEVICE_TYPE,
|
|
28
|
+
H200_DEVICE_TYPE,
|
|
29
|
+
CapacityType,
|
|
30
|
+
)
|
|
26
31
|
from ..system_characteristics import get_system_characteristics_by_device_type
|
|
27
32
|
from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
|
|
28
33
|
|
|
34
|
+
|
|
29
35
|
yaml = yaml.YAML()
|
|
30
36
|
|
|
31
37
|
a3mega_device_type = H100_MEGA_DEVICE_TYPE
|
|
32
38
|
a3ultra_device_type = H200_DEVICE_TYPE
|
|
33
|
-
|
|
39
|
+
a4_device_type = B200_DEVICE_TYPE
|
|
40
|
+
supported_device_types = {
|
|
41
|
+
a3mega_device_type,
|
|
42
|
+
a3ultra_device_type,
|
|
43
|
+
a4_device_type,
|
|
44
|
+
}
|
|
34
45
|
blueprint_dependencies_dir = {
|
|
35
46
|
a3mega_device_type: "src/xpk/blueprints/a3mega",
|
|
36
47
|
a3ultra_device_type: "src/xpk/blueprints/a3ultra",
|
|
48
|
+
a4_device_type: "src/xpk/blueprints/a4",
|
|
37
49
|
}
|
|
38
50
|
|
|
39
51
|
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
|
|
40
|
-
cluster_toolkit_version = "v1.
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
|
|
44
|
-
return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
|
|
48
|
-
return [f"{cluster_name}-sub-1"] + [
|
|
49
|
-
f"{cluster_name}-rdma-sub-{i}" for i in range(8)
|
|
50
|
-
]
|
|
52
|
+
cluster_toolkit_version = "v1.48.0"
|
|
51
53
|
|
|
52
54
|
|
|
53
55
|
class BlueprintGeneratorOutput:
|
|
@@ -157,6 +159,11 @@ class BlueprintGenerator:
|
|
|
157
159
|
"total_min_nodes": system_node_pool_min_node_count,
|
|
158
160
|
"total_max_nodes": 1000,
|
|
159
161
|
},
|
|
162
|
+
"k8s_network_names": {
|
|
163
|
+
"gvnic_prefix": f"{cluster_name}-gpunet-",
|
|
164
|
+
"gvnic_postfix": "-subnet",
|
|
165
|
+
"gvnic_start_index": 0,
|
|
166
|
+
},
|
|
160
167
|
},
|
|
161
168
|
outputs=["instructions"],
|
|
162
169
|
)
|
|
@@ -173,13 +180,17 @@ class BlueprintGenerator:
|
|
|
173
180
|
a3_megagpu_pool_0 = DeploymentModule(
|
|
174
181
|
id="a3_megagpu_pool_0",
|
|
175
182
|
source="modules/compute/gke-node-pool",
|
|
176
|
-
use=["gke_cluster", gpu_subnets_name
|
|
183
|
+
use=["gke_cluster", gpu_subnets_name],
|
|
177
184
|
settings={
|
|
178
185
|
"name": f"{cluster_name}-a3-megagpu-pool-0",
|
|
179
186
|
"machine_type": system.gce_machine_type,
|
|
180
187
|
"static_node_count": num_nodes,
|
|
181
188
|
"zones": [zone],
|
|
182
|
-
"host_maintenance_interval":
|
|
189
|
+
"host_maintenance_interval": (
|
|
190
|
+
None
|
|
191
|
+
if capacity_type == CapacityType.RESERVATION
|
|
192
|
+
else "PERIODIC"
|
|
193
|
+
),
|
|
183
194
|
"reservation_affinity": self._getblock_reservation_affinity(
|
|
184
195
|
reservation
|
|
185
196
|
),
|
|
@@ -190,6 +201,9 @@ class BlueprintGenerator:
|
|
|
190
201
|
},
|
|
191
202
|
outputs=["instructions"],
|
|
192
203
|
)
|
|
204
|
+
|
|
205
|
+
set_placement_policy = capacity_type != CapacityType.SPOT
|
|
206
|
+
tas_name = "topologyName: 'gke-default'" if set_placement_policy else ""
|
|
193
207
|
num_chips = num_nodes * system.chips_per_vm
|
|
194
208
|
workload = DeploymentModule(
|
|
195
209
|
id="workload_component_install",
|
|
@@ -200,7 +214,10 @@ class BlueprintGenerator:
|
|
|
200
214
|
"install": True,
|
|
201
215
|
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
202
216
|
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
203
|
-
"config_template_vars": {
|
|
217
|
+
"config_template_vars": {
|
|
218
|
+
"num_chips": num_chips,
|
|
219
|
+
"tas_name": tas_name,
|
|
220
|
+
},
|
|
204
221
|
},
|
|
205
222
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
206
223
|
"apply_manifests": [{
|
|
@@ -236,15 +253,19 @@ class BlueprintGenerator:
|
|
|
236
253
|
primary_vpc,
|
|
237
254
|
gpunets,
|
|
238
255
|
gke_cluster,
|
|
239
|
-
group_placement_0,
|
|
240
256
|
a3_megagpu_pool_0,
|
|
241
257
|
workload,
|
|
242
258
|
workload_configmap,
|
|
243
259
|
],
|
|
244
260
|
)
|
|
261
|
+
|
|
262
|
+
if set_placement_policy:
|
|
263
|
+
a3_megagpu_pool_0.use.append(group_placement_0.id)
|
|
264
|
+
primary_group.modules.append(group_placement_0)
|
|
265
|
+
|
|
245
266
|
a3_mega_blueprint = Blueprint(
|
|
246
267
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
247
|
-
gcs_bucket, prefix
|
|
268
|
+
gcs_bucket, cluster_name, prefix
|
|
248
269
|
),
|
|
249
270
|
blueprint_name=blueprint_name,
|
|
250
271
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -261,8 +282,8 @@ class BlueprintGenerator:
|
|
|
261
282
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
262
283
|
blueprint_name, a3_mega_blueprint, prefix
|
|
263
284
|
)
|
|
264
|
-
blueprint_dependencies = self.
|
|
265
|
-
blueprint_name, prefix
|
|
285
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
286
|
+
a3mega_device_type, blueprint_name, prefix
|
|
266
287
|
)
|
|
267
288
|
xpk_print(f"Blueprint file path: {blueprint_file_path}")
|
|
268
289
|
xpk_print(
|
|
@@ -331,7 +352,7 @@ class BlueprintGenerator:
|
|
|
331
352
|
)
|
|
332
353
|
ml_gke = Blueprint(
|
|
333
354
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
334
|
-
gcs_bucket, prefix
|
|
355
|
+
gcs_bucket, cluster_name, prefix
|
|
335
356
|
),
|
|
336
357
|
blueprint_name=blueprint_name,
|
|
337
358
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -490,6 +511,13 @@ class BlueprintGenerator:
|
|
|
490
511
|
" alias_ip_range=[]}],"
|
|
491
512
|
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
492
513
|
),
|
|
514
|
+
"k8s_network_names": {
|
|
515
|
+
"rdma_prefix": f"{cluster_name}-rdma-sub-",
|
|
516
|
+
"rdma_start_index": 0,
|
|
517
|
+
"rdma_postfix": "",
|
|
518
|
+
"gvnic_prefix": f"{cluster_name}-sub-",
|
|
519
|
+
"gvnic_start_index": 1,
|
|
520
|
+
},
|
|
493
521
|
},
|
|
494
522
|
outputs=["instructions"],
|
|
495
523
|
)
|
|
@@ -546,7 +574,7 @@ class BlueprintGenerator:
|
|
|
546
574
|
"install": True,
|
|
547
575
|
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
548
576
|
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
549
|
-
"config_template_vars": {"num_chips":
|
|
577
|
+
"config_template_vars": {"num_chips": num_chips},
|
|
550
578
|
},
|
|
551
579
|
"jobset": {"install": True, "version": "v0.7.2"},
|
|
552
580
|
"apply_manifests": [
|
|
@@ -597,7 +625,7 @@ class BlueprintGenerator:
|
|
|
597
625
|
)
|
|
598
626
|
a3_ultra_blueprint = Blueprint(
|
|
599
627
|
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
600
|
-
gcs_bucket, prefix
|
|
628
|
+
gcs_bucket, cluster_name, prefix
|
|
601
629
|
),
|
|
602
630
|
blueprint_name=blueprint_name,
|
|
603
631
|
toolkit_modules_url=cluster_toolkit_url,
|
|
@@ -614,8 +642,276 @@ class BlueprintGenerator:
|
|
|
614
642
|
blueprint_file_path = self._save_blueprint_to_file(
|
|
615
643
|
blueprint_name, a3_ultra_blueprint, prefix
|
|
616
644
|
)
|
|
617
|
-
blueprint_dependencies = self.
|
|
618
|
-
blueprint_name, prefix
|
|
645
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
646
|
+
a3ultra_device_type, blueprint_name, prefix
|
|
647
|
+
)
|
|
648
|
+
return BlueprintGeneratorOutput(
|
|
649
|
+
blueprint_file=blueprint_file_path,
|
|
650
|
+
blueprint_dependencies=blueprint_dependencies,
|
|
651
|
+
)
|
|
652
|
+
|
|
653
|
+
def generate_a4_blueprint(
|
|
654
|
+
self,
|
|
655
|
+
project_id: str,
|
|
656
|
+
cluster_name: str,
|
|
657
|
+
blueprint_name: str,
|
|
658
|
+
region: str,
|
|
659
|
+
zone: str,
|
|
660
|
+
auth_cidr: str,
|
|
661
|
+
system_node_pool_machine_type: str,
|
|
662
|
+
reservation: Optional[str | None] = None,
|
|
663
|
+
gcs_bucket: Optional[str | None] = None,
|
|
664
|
+
num_nodes: int = 2,
|
|
665
|
+
prefix: str = "",
|
|
666
|
+
system_node_pool_min_node_count: int = 2,
|
|
667
|
+
capacity_type: CapacityType = CapacityType.ON_DEMAND,
|
|
668
|
+
) -> BlueprintGeneratorOutput:
|
|
669
|
+
"""Create A4 blueprint.
|
|
670
|
+
|
|
671
|
+
Args:
|
|
672
|
+
Returns:
|
|
673
|
+
- Blueprint representing cluster toolkit blueprint
|
|
674
|
+
"""
|
|
675
|
+
nccl_installer_path = (
|
|
676
|
+
f'$(ghpc_stage("{blueprint_name}"))/nccl-rdma-installer-a4.yaml'
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
net_0_id = f"{cluster_name}-net-0"
|
|
680
|
+
gpu_net_0 = DeploymentModule(
|
|
681
|
+
id=net_0_id,
|
|
682
|
+
source="modules/network/vpc",
|
|
683
|
+
settings={
|
|
684
|
+
"network_name": f"{cluster_name}-net-0",
|
|
685
|
+
"mtu": 8896,
|
|
686
|
+
"subnetworks": [{
|
|
687
|
+
"subnet_name": f"{cluster_name}-sub-0",
|
|
688
|
+
"subnet_region": region,
|
|
689
|
+
"subnet_ip": "192.168.0.0/18",
|
|
690
|
+
}],
|
|
691
|
+
"secondary_ranges_list": [{
|
|
692
|
+
"subnetwork_name": f"{cluster_name}-sub-0",
|
|
693
|
+
"ranges": [
|
|
694
|
+
{"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
|
|
695
|
+
{"range_name": "services", "ip_cidr_range": "10.0.32.0/20"},
|
|
696
|
+
],
|
|
697
|
+
}],
|
|
698
|
+
"firewall_rules": [{
|
|
699
|
+
"name": f"{cluster_name}-internal-0",
|
|
700
|
+
"ranges": ["192.168.0.0/16"],
|
|
701
|
+
"allow": [
|
|
702
|
+
{"protocol": "tcp", "ports": ["0-65535"]},
|
|
703
|
+
{"protocol": "udp", "ports": ["0-65535"]},
|
|
704
|
+
{"protocol": "icmp"},
|
|
705
|
+
],
|
|
706
|
+
}],
|
|
707
|
+
},
|
|
708
|
+
)
|
|
709
|
+
net_1_id = f"{cluster_name}-net-1"
|
|
710
|
+
gpu_net_1 = DeploymentModule(
|
|
711
|
+
id=net_1_id,
|
|
712
|
+
source="modules/network/vpc",
|
|
713
|
+
settings={
|
|
714
|
+
"network_name": f"{cluster_name}-net-1",
|
|
715
|
+
"mtu": 8896,
|
|
716
|
+
"subnetworks": [{
|
|
717
|
+
"subnet_name": f"{cluster_name}-sub-1",
|
|
718
|
+
"subnet_region": region,
|
|
719
|
+
"subnet_ip": "192.168.64.0/18",
|
|
720
|
+
}],
|
|
721
|
+
"firewall_rules": [{
|
|
722
|
+
"name": f"{cluster_name}-internal-1",
|
|
723
|
+
"ranges": ["192.168.0.0/16"],
|
|
724
|
+
"allow": [
|
|
725
|
+
{"protocol": "tcp", "ports": ["0-65535"]},
|
|
726
|
+
{"protocol": "udp", "ports": ["0-65535"]},
|
|
727
|
+
{"protocol": "icmp"},
|
|
728
|
+
],
|
|
729
|
+
}],
|
|
730
|
+
},
|
|
731
|
+
)
|
|
732
|
+
rma_net_id = f"{cluster_name}-rdma-net"
|
|
733
|
+
rma_net = DeploymentModule(
|
|
734
|
+
id=rma_net_id,
|
|
735
|
+
source="modules/network/gpu-rdma-vpc",
|
|
736
|
+
settings={
|
|
737
|
+
"network_name": f"{cluster_name}-rdma-net",
|
|
738
|
+
"mtu": 8896,
|
|
739
|
+
"network_profile": f"https://www.googleapis.com/compute/beta/projects/{project_id}/global/networkProfiles/{zone}-vpc-roce",
|
|
740
|
+
"network_routing_mode": "REGIONAL",
|
|
741
|
+
"subnetworks_template": {
|
|
742
|
+
"name_prefix": f"{cluster_name}-rdma-sub",
|
|
743
|
+
"count": 8,
|
|
744
|
+
"ip_range": "192.168.128.0/18",
|
|
745
|
+
"region": region,
|
|
746
|
+
},
|
|
747
|
+
},
|
|
748
|
+
)
|
|
749
|
+
cluster_id = f"{cluster_name}-a4-cluster"
|
|
750
|
+
a4_cluster = DeploymentModule(
|
|
751
|
+
id=cluster_id,
|
|
752
|
+
source="modules/scheduler/gke-cluster",
|
|
753
|
+
use=[net_0_id],
|
|
754
|
+
settings={
|
|
755
|
+
"system_node_pool_machine_type": system_node_pool_machine_type,
|
|
756
|
+
"system_node_pool_node_count": {
|
|
757
|
+
"total_min_nodes": system_node_pool_min_node_count,
|
|
758
|
+
"total_max_nodes": 1000,
|
|
759
|
+
},
|
|
760
|
+
"prefix_with_deployment_name": False,
|
|
761
|
+
"name_suffix": cluster_name,
|
|
762
|
+
"enable_dcgm_monitoring": True,
|
|
763
|
+
"enable_gcsfuse_csi": True,
|
|
764
|
+
"enable_private_endpoint": False,
|
|
765
|
+
"master_authorized_networks": [{
|
|
766
|
+
"cidr_block": auth_cidr,
|
|
767
|
+
"display_name": "kubectl-access-network",
|
|
768
|
+
}],
|
|
769
|
+
"additional_networks": (
|
|
770
|
+
f"$(concat([{{network={cluster_name}-net-1.network_name,"
|
|
771
|
+
f" subnetwork={cluster_name}-net-1.subnetwork_name,"
|
|
772
|
+
f' subnetwork_project="{project_id}", nic_type="GVNIC",'
|
|
773
|
+
" queue_count=null, network_ip=null, stack_type=null,"
|
|
774
|
+
" access_config=[{nat_ip=null, public_ptr_domain_name=null,"
|
|
775
|
+
" network_tier=null}], ipv6_access_config=[],"
|
|
776
|
+
" alias_ip_range=[]}],"
|
|
777
|
+
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
778
|
+
),
|
|
779
|
+
"version_prefix": "1.32.",
|
|
780
|
+
"release_channel": "RAPID",
|
|
781
|
+
"maintenance_exclusions": [{
|
|
782
|
+
"name": "no-minor-or-node-upgrades-indefinite",
|
|
783
|
+
"start_time": "2024-12-01T00:00:00Z",
|
|
784
|
+
"end_time": "2025-12-22T00:00:00Z",
|
|
785
|
+
"exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
|
|
786
|
+
}],
|
|
787
|
+
},
|
|
788
|
+
outputs=["instructions"],
|
|
789
|
+
)
|
|
790
|
+
system, _ = get_system_characteristics_by_device_type(a4_device_type)
|
|
791
|
+
if system is None:
|
|
792
|
+
xpk_print(
|
|
793
|
+
"Error: Could not retrieve system characteristics for"
|
|
794
|
+
f" {a4_device_type} device_type."
|
|
795
|
+
)
|
|
796
|
+
xpk_exit(1)
|
|
797
|
+
gpu_pool = DeploymentModule(
|
|
798
|
+
id=f"{cluster_name}-a4-pool",
|
|
799
|
+
source="modules/compute/gke-node-pool",
|
|
800
|
+
use=[cluster_id],
|
|
801
|
+
settings={
|
|
802
|
+
"machine_type": system.gce_machine_type,
|
|
803
|
+
"auto_upgrade": True,
|
|
804
|
+
"zones": [zone],
|
|
805
|
+
"disk_type": "hyperdisk-balanced",
|
|
806
|
+
"static_node_count": num_nodes,
|
|
807
|
+
"local_ssd_count_ephemeral_storage": 32,
|
|
808
|
+
"spot": capacity_type == CapacityType.SPOT,
|
|
809
|
+
"reservation_affinity": self._getblock_reservation_affinity(
|
|
810
|
+
reservation
|
|
811
|
+
),
|
|
812
|
+
"max_pods_per_node": 32,
|
|
813
|
+
"guest_accelerator": [{
|
|
814
|
+
"type": system.gke_accelerator,
|
|
815
|
+
"count": 8,
|
|
816
|
+
"gpu_driver_installation_config": {
|
|
817
|
+
"gpu_driver_version": "LATEST"
|
|
818
|
+
},
|
|
819
|
+
}],
|
|
820
|
+
"additional_networks": (
|
|
821
|
+
f"$(concat([{{network={cluster_name}-net-1.network_name,"
|
|
822
|
+
f" subnetwork={cluster_name}-net-1.subnetwork_name,"
|
|
823
|
+
f' subnetwork_project="{project_id}", nic_type="GVNIC",'
|
|
824
|
+
" queue_count=null, network_ip=null, stack_type=null,"
|
|
825
|
+
" access_config=[{nat_ip=null, public_ptr_domain_name=null,"
|
|
826
|
+
" network_tier=null}], ipv6_access_config=[],"
|
|
827
|
+
" alias_ip_range=[]}],"
|
|
828
|
+
f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
|
|
829
|
+
),
|
|
830
|
+
},
|
|
831
|
+
outputs=["instructions"],
|
|
832
|
+
)
|
|
833
|
+
|
|
834
|
+
num_chips = num_nodes * system.chips_per_vm
|
|
835
|
+
workload_manager_install_id = "workload-manager-install"
|
|
836
|
+
workload_manager_install = DeploymentModule(
|
|
837
|
+
id=workload_manager_install_id,
|
|
838
|
+
source="modules/management/kubectl-apply",
|
|
839
|
+
use=[cluster_id],
|
|
840
|
+
settings={
|
|
841
|
+
"kueue": {
|
|
842
|
+
"install": True,
|
|
843
|
+
"version": "v0.10.0", # TAS feature-gates is enabled in CT
|
|
844
|
+
"config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
|
|
845
|
+
"config_template_vars": {"num_chips": num_chips},
|
|
846
|
+
},
|
|
847
|
+
"jobset": {"install": True, "version": "v0.7.2"},
|
|
848
|
+
"apply_manifests": [
|
|
849
|
+
{"source": nccl_installer_path},
|
|
850
|
+
{
|
|
851
|
+
"source": (
|
|
852
|
+
f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
|
|
853
|
+
)
|
|
854
|
+
},
|
|
855
|
+
],
|
|
856
|
+
},
|
|
857
|
+
)
|
|
858
|
+
|
|
859
|
+
workload_configmap = DeploymentModule(
|
|
860
|
+
id="workload_configmap",
|
|
861
|
+
source="modules/management/kubectl-apply",
|
|
862
|
+
use=[cluster_id],
|
|
863
|
+
settings={
|
|
864
|
+
"apply_manifests": [{
|
|
865
|
+
"source": (
|
|
866
|
+
f'$(ghpc_stage("{blueprint_name}"))/config-map.yaml.tftpl'
|
|
867
|
+
),
|
|
868
|
+
"template_vars": {
|
|
869
|
+
"resource_config_name": (
|
|
870
|
+
f"{cluster_name}-resources-configmap"
|
|
871
|
+
),
|
|
872
|
+
"num_nodes": f"{num_nodes}",
|
|
873
|
+
"cluster_config_name": f"{cluster_name}-metadata-configmap",
|
|
874
|
+
"capacity_type": f"{capacity_type.value}",
|
|
875
|
+
"reservation": f"{reservation}",
|
|
876
|
+
},
|
|
877
|
+
}]
|
|
878
|
+
},
|
|
879
|
+
)
|
|
880
|
+
|
|
881
|
+
primary_group = DeploymentGroup(
|
|
882
|
+
group="primary",
|
|
883
|
+
modules=[
|
|
884
|
+
gpu_net_0,
|
|
885
|
+
gpu_net_1,
|
|
886
|
+
rma_net,
|
|
887
|
+
a4_cluster,
|
|
888
|
+
gpu_pool,
|
|
889
|
+
workload_manager_install,
|
|
890
|
+
workload_configmap,
|
|
891
|
+
],
|
|
892
|
+
)
|
|
893
|
+
|
|
894
|
+
a4_blueprint = Blueprint(
|
|
895
|
+
terraform_backend_defaults=self._getblock_terraform_backend(
|
|
896
|
+
gcs_bucket, cluster_name, prefix
|
|
897
|
+
),
|
|
898
|
+
blueprint_name=blueprint_name,
|
|
899
|
+
toolkit_modules_url=cluster_toolkit_url,
|
|
900
|
+
toolkit_modules_version=cluster_toolkit_version,
|
|
901
|
+
deployment_groups=[primary_group],
|
|
902
|
+
vars={
|
|
903
|
+
"project_id": project_id,
|
|
904
|
+
"deployment_name": blueprint_name,
|
|
905
|
+
"region": region,
|
|
906
|
+
"zone": zone,
|
|
907
|
+
},
|
|
908
|
+
)
|
|
909
|
+
|
|
910
|
+
blueprint_file_path = self._save_blueprint_to_file(
|
|
911
|
+
blueprint_name, a4_blueprint, prefix
|
|
912
|
+
)
|
|
913
|
+
blueprint_dependencies = self._get_blueprint_dependencies(
|
|
914
|
+
a4_device_type, blueprint_name, prefix
|
|
619
915
|
)
|
|
620
916
|
return BlueprintGeneratorOutput(
|
|
621
917
|
blueprint_file=blueprint_file_path,
|
|
@@ -638,7 +934,7 @@ class BlueprintGenerator:
|
|
|
638
934
|
)
|
|
639
935
|
|
|
640
936
|
def _getblock_terraform_backend(
|
|
641
|
-
self, gcs_bucket: str, prefix: str = ""
|
|
937
|
+
self, gcs_bucket: str, cluster_name: str, prefix: str = ""
|
|
642
938
|
) -> dict | None:
|
|
643
939
|
if gcs_bucket is None:
|
|
644
940
|
return None
|
|
@@ -646,12 +942,19 @@ class BlueprintGenerator:
|
|
|
646
942
|
"type": "gcs",
|
|
647
943
|
"configuration": {
|
|
648
944
|
"bucket": gcs_bucket,
|
|
649
|
-
"prefix": self._get_terraforrm_backend_full_prefix(
|
|
945
|
+
"prefix": self._get_terraforrm_backend_full_prefix(
|
|
946
|
+
cluster_name, prefix
|
|
947
|
+
),
|
|
650
948
|
},
|
|
651
949
|
}
|
|
652
950
|
|
|
653
|
-
def _get_terraforrm_backend_full_prefix(
|
|
654
|
-
|
|
951
|
+
def _get_terraforrm_backend_full_prefix(
|
|
952
|
+
self, cluster_name: str, prefix: str = ""
|
|
953
|
+
) -> str:
|
|
954
|
+
full_prefix = "xpk_terraform_state"
|
|
955
|
+
if prefix:
|
|
956
|
+
full_prefix += f"/{prefix}"
|
|
957
|
+
return f"{full_prefix}/{cluster_name}/"
|
|
655
958
|
|
|
656
959
|
def _save_blueprint_to_file(
|
|
657
960
|
self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
|
|
@@ -676,27 +979,14 @@ class BlueprintGenerator:
|
|
|
676
979
|
blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
|
|
677
980
|
return os.path.exists(blueprint_path)
|
|
678
981
|
|
|
679
|
-
def
|
|
680
|
-
self, blueprint_name: str, prefix: str = ""
|
|
681
|
-
) -> str:
|
|
682
|
-
deployment_files_path = os.path.join(
|
|
683
|
-
self._get_storage_path(prefix), blueprint_name
|
|
684
|
-
)
|
|
685
|
-
shutil.copytree(
|
|
686
|
-
blueprint_dependencies_dir[a3mega_device_type],
|
|
687
|
-
deployment_files_path,
|
|
688
|
-
dirs_exist_ok=True,
|
|
689
|
-
)
|
|
690
|
-
return deployment_files_path
|
|
691
|
-
|
|
692
|
-
def _get_a3_ultra_blueprint_dependencies(
|
|
693
|
-
self, blueprint_name: str, prefix: str = ""
|
|
982
|
+
def _get_blueprint_dependencies(
|
|
983
|
+
self, device_type: str, blueprint_name: str, prefix: str = ""
|
|
694
984
|
) -> str:
|
|
695
985
|
deployment_files_path = os.path.join(
|
|
696
986
|
self._get_storage_path(prefix), blueprint_name
|
|
697
987
|
)
|
|
698
988
|
shutil.copytree(
|
|
699
|
-
blueprint_dependencies_dir[
|
|
989
|
+
blueprint_dependencies_dir[device_type],
|
|
700
990
|
deployment_files_path,
|
|
701
991
|
dirs_exist_ok=True,
|
|
702
992
|
)
|
xpk/core/capacity.py
CHANGED