xpk 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. xpk/commands/batch.py +19 -12
  2. xpk/commands/cluster.py +33 -16
  3. xpk/commands/cluster_gcluster.py +22 -5
  4. xpk/commands/info.py +2 -4
  5. xpk/commands/job.py +7 -8
  6. xpk/commands/kjob_common.py +23 -20
  7. xpk/commands/run.py +17 -11
  8. xpk/commands/shell.py +3 -4
  9. xpk/commands/storage.py +64 -19
  10. xpk/commands/workload.py +154 -319
  11. xpk/core/blueprint/blueprint_definitions.py +2 -0
  12. xpk/core/blueprint/blueprint_generator.py +322 -32
  13. xpk/core/capacity.py +1 -0
  14. xpk/core/cluster.py +75 -5
  15. xpk/core/config.py +3 -1
  16. xpk/core/docker_manager.py +1 -1
  17. xpk/core/docker_resources.py +9 -21
  18. xpk/core/filestore.py +11 -3
  19. xpk/core/gcsfuse.py +8 -5
  20. xpk/core/kjob.py +57 -18
  21. xpk/core/nap.py +4 -0
  22. xpk/core/network.py +11 -21
  23. xpk/core/nodepool.py +28 -26
  24. xpk/core/pathways.py +165 -210
  25. xpk/core/scheduling.py +36 -0
  26. xpk/core/storage.py +66 -12
  27. xpk/core/system_characteristics.py +9 -0
  28. xpk/core/workload.py +27 -82
  29. xpk/core/workload_decorators/rdma_decorator.py +3 -3
  30. xpk/core/workload_decorators/storage_decorator.py +8 -3
  31. xpk/core/workload_decorators/tcpxo_decorator.py +2 -2
  32. xpk/parser/cluster.py +15 -6
  33. xpk/parser/storage.py +14 -3
  34. xpk/parser/workload.py +59 -31
  35. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/METADATA +60 -4
  36. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/RECORD +40 -40
  37. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/WHEEL +1 -1
  38. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/entry_points.txt +0 -0
  39. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/licenses/LICENSE +0 -0
  40. {xpk-0.7.1.dist-info → xpk-0.8.0.dist-info}/top_level.txt +0 -0
@@ -22,22 +22,34 @@ from ruamel import yaml
22
22
 
23
23
  from ...utils.console import xpk_exit, xpk_print
24
24
  from ...utils.file import ensure_directory_exists
25
- from ..capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, CapacityType
25
+ from ..capacity import (
26
+ B200_DEVICE_TYPE,
27
+ H100_MEGA_DEVICE_TYPE,
28
+ H200_DEVICE_TYPE,
29
+ CapacityType,
30
+ )
26
31
  from ..system_characteristics import get_system_characteristics_by_device_type
27
32
  from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
28
33
 
34
+
29
35
  yaml = yaml.YAML()
30
36
 
31
37
  a3mega_device_type = H100_MEGA_DEVICE_TYPE
32
38
  a3ultra_device_type = H200_DEVICE_TYPE
33
- supported_device_types = {a3mega_device_type, a3ultra_device_type}
39
+ a4_device_type = B200_DEVICE_TYPE
40
+ supported_device_types = {
41
+ a3mega_device_type,
42
+ a3ultra_device_type,
43
+ a4_device_type,
44
+ }
34
45
  blueprint_dependencies_dir = {
35
46
  a3mega_device_type: "src/xpk/blueprints/a3mega",
36
47
  a3ultra_device_type: "src/xpk/blueprints/a3ultra",
48
+ a4_device_type: "src/xpk/blueprints/a4",
37
49
  }
38
50
 
39
51
  cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
40
- cluster_toolkit_version = "v1.45.1"
52
+ cluster_toolkit_version = "v1.48.0"
41
53
 
42
54
 
43
55
  def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
@@ -50,6 +62,10 @@ def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
50
62
  ]
51
63
 
52
64
 
65
def get_subnetworks_for_a4() -> list[str]:
  """Return the subnetwork names used for A4 clusters.

  Returns:
    A new list holding "gvnic-1" followed by the eight RDMA subnetwork
    names "rdma-0" through "rdma-7".
  """
  rdma_subnetworks = [f"rdma-{index}" for index in range(8)]
  return ["gvnic-1", *rdma_subnetworks]
67
+
68
+
53
69
  class BlueprintGeneratorOutput:
54
70
  """BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
55
71
  Atributes:
@@ -157,6 +173,11 @@ class BlueprintGenerator:
157
173
  "total_min_nodes": system_node_pool_min_node_count,
158
174
  "total_max_nodes": 1000,
159
175
  },
176
+ "k8s_network_names": {
177
+ "gvnic_prefix": f"{cluster_name}-gpunet-",
178
+ "gvnic_postfix": "-subnet",
179
+ "gvnic_start_index": 0,
180
+ },
160
181
  },
161
182
  outputs=["instructions"],
162
183
  )
@@ -200,7 +221,7 @@ class BlueprintGenerator:
200
221
  "install": True,
201
222
  "version": "v0.10.0", # TAS feature-gates is enabled in CT
202
223
  "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
203
- "config_template_vars": {"num_chips": f"{num_chips}"},
224
+ "config_template_vars": {"num_chips": num_chips},
204
225
  },
205
226
  "jobset": {"install": True, "version": "v0.7.2"},
206
227
  "apply_manifests": [{
@@ -244,7 +265,7 @@ class BlueprintGenerator:
244
265
  )
245
266
  a3_mega_blueprint = Blueprint(
246
267
  terraform_backend_defaults=self._getblock_terraform_backend(
247
- gcs_bucket, prefix
268
+ gcs_bucket, cluster_name, prefix
248
269
  ),
249
270
  blueprint_name=blueprint_name,
250
271
  toolkit_modules_url=cluster_toolkit_url,
@@ -261,8 +282,8 @@ class BlueprintGenerator:
261
282
  blueprint_file_path = self._save_blueprint_to_file(
262
283
  blueprint_name, a3_mega_blueprint, prefix
263
284
  )
264
- blueprint_dependencies = self._get_a3_mega_blueprint_dependencies(
265
- blueprint_name, prefix
285
+ blueprint_dependencies = self._get_blueprint_dependencies(
286
+ a3mega_device_type, blueprint_name, prefix
266
287
  )
267
288
  xpk_print(f"Blueprint file path: {blueprint_file_path}")
268
289
  xpk_print(
@@ -331,7 +352,7 @@ class BlueprintGenerator:
331
352
  )
332
353
  ml_gke = Blueprint(
333
354
  terraform_backend_defaults=self._getblock_terraform_backend(
334
- gcs_bucket, prefix
355
+ gcs_bucket, cluster_name, prefix
335
356
  ),
336
357
  blueprint_name=blueprint_name,
337
358
  toolkit_modules_url=cluster_toolkit_url,
@@ -490,6 +511,13 @@ class BlueprintGenerator:
490
511
  " alias_ip_range=[]}],"
491
512
  f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
492
513
  ),
514
+ "k8s_network_names": {
515
+ "rdma_prefix": f"{cluster_name}-rdma-sub-",
516
+ "rdma_start_index": 0,
517
+ "rdma_postfix": "",
518
+ "gvnic_prefix": f"{cluster_name}-sub-",
519
+ "gvnic_start_index": 1,
520
+ },
493
521
  },
494
522
  outputs=["instructions"],
495
523
  )
@@ -546,7 +574,7 @@ class BlueprintGenerator:
546
574
  "install": True,
547
575
  "version": "v0.10.0", # TAS feature-gates is enabled in CT
548
576
  "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
549
- "config_template_vars": {"num_chips": f"{num_chips}"},
577
+ "config_template_vars": {"num_chips": num_chips},
550
578
  },
551
579
  "jobset": {"install": True, "version": "v0.7.2"},
552
580
  "apply_manifests": [
@@ -597,7 +625,7 @@ class BlueprintGenerator:
597
625
  )
598
626
  a3_ultra_blueprint = Blueprint(
599
627
  terraform_backend_defaults=self._getblock_terraform_backend(
600
- gcs_bucket, prefix
628
+ gcs_bucket, cluster_name, prefix
601
629
  ),
602
630
  blueprint_name=blueprint_name,
603
631
  toolkit_modules_url=cluster_toolkit_url,
@@ -614,8 +642,276 @@ class BlueprintGenerator:
614
642
  blueprint_file_path = self._save_blueprint_to_file(
615
643
  blueprint_name, a3_ultra_blueprint, prefix
616
644
  )
617
- blueprint_dependencies = self._get_a3_ultra_blueprint_dependencies(
618
- blueprint_name, prefix
645
+ blueprint_dependencies = self._get_blueprint_dependencies(
646
+ a3ultra_device_type, blueprint_name, prefix
647
+ )
648
+ return BlueprintGeneratorOutput(
649
+ blueprint_file=blueprint_file_path,
650
+ blueprint_dependencies=blueprint_dependencies,
651
+ )
652
+
653
+ def generate_a4_blueprint(
654
+ self,
655
+ project_id: str,
656
+ cluster_name: str,
657
+ blueprint_name: str,
658
+ region: str,
659
+ zone: str,
660
+ auth_cidr: str,
661
+ system_node_pool_machine_type: str,
662
+ reservation: Optional[str | None] = None,
663
+ gcs_bucket: Optional[str | None] = None,
664
+ num_nodes: int = 2,
665
+ prefix: str = "",
666
+ system_node_pool_min_node_count: int = 2,
667
+ capacity_type: CapacityType = CapacityType.ON_DEMAND,
668
+ ) -> BlueprintGeneratorOutput:
669
+ """Create A4 blueprint.
670
+
671
+ Args:
672
+ Returns:
673
+ - Blueprint representing cluster toolkit blueprint
674
+ """
675
+ nccl_installer_path = (
676
+ f'$(ghpc_stage("{blueprint_name}"))/nccl-rdma-installer-a4.yaml'
677
+ )
678
+
679
+ net_0_id = f"{cluster_name}-net-0"
680
+ gpu_net_0 = DeploymentModule(
681
+ id=net_0_id,
682
+ source="modules/network/vpc",
683
+ settings={
684
+ "network_name": f"{cluster_name}-net-0",
685
+ "mtu": 8896,
686
+ "subnetworks": [{
687
+ "subnet_name": f"{cluster_name}-sub-0",
688
+ "subnet_region": region,
689
+ "subnet_ip": "192.168.0.0/18",
690
+ }],
691
+ "secondary_ranges_list": [{
692
+ "subnetwork_name": f"{cluster_name}-sub-0",
693
+ "ranges": [
694
+ {"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
695
+ {"range_name": "services", "ip_cidr_range": "10.0.32.0/20"},
696
+ ],
697
+ }],
698
+ "firewall_rules": [{
699
+ "name": f"{cluster_name}-internal-0",
700
+ "ranges": ["192.168.0.0/16"],
701
+ "allow": [
702
+ {"protocol": "tcp", "ports": ["0-65535"]},
703
+ {"protocol": "udp", "ports": ["0-65535"]},
704
+ {"protocol": "icmp"},
705
+ ],
706
+ }],
707
+ },
708
+ )
709
+ net_1_id = f"{cluster_name}-net-1"
710
+ gpu_net_1 = DeploymentModule(
711
+ id=net_1_id,
712
+ source="modules/network/vpc",
713
+ settings={
714
+ "network_name": f"{cluster_name}-net-1",
715
+ "mtu": 8896,
716
+ "subnetworks": [{
717
+ "subnet_name": f"{cluster_name}-sub-1",
718
+ "subnet_region": region,
719
+ "subnet_ip": "192.168.64.0/18",
720
+ }],
721
+ "firewall_rules": [{
722
+ "name": f"{cluster_name}-internal-1",
723
+ "ranges": ["192.168.0.0/16"],
724
+ "allow": [
725
+ {"protocol": "tcp", "ports": ["0-65535"]},
726
+ {"protocol": "udp", "ports": ["0-65535"]},
727
+ {"protocol": "icmp"},
728
+ ],
729
+ }],
730
+ },
731
+ )
732
+ rma_net_id = f"{cluster_name}-rdma-net"
733
+ rma_net = DeploymentModule(
734
+ id=rma_net_id,
735
+ source="modules/network/gpu-rdma-vpc",
736
+ settings={
737
+ "network_name": f"{cluster_name}-rdma-net",
738
+ "mtu": 8896,
739
+ "network_profile": f"https://www.googleapis.com/compute/beta/projects/{project_id}/global/networkProfiles/{zone}-vpc-roce",
740
+ "network_routing_mode": "REGIONAL",
741
+ "subnetworks_template": {
742
+ "name_prefix": f"{cluster_name}-rdma-sub",
743
+ "count": 8,
744
+ "ip_range": "192.168.128.0/18",
745
+ "region": region,
746
+ },
747
+ },
748
+ )
749
+ cluster_id = f"{cluster_name}-a4-cluster"
750
+ a4_cluster = DeploymentModule(
751
+ id=cluster_id,
752
+ source="modules/scheduler/gke-cluster",
753
+ use=[net_0_id],
754
+ settings={
755
+ "system_node_pool_machine_type": system_node_pool_machine_type,
756
+ "system_node_pool_node_count": {
757
+ "total_min_nodes": system_node_pool_min_node_count,
758
+ "total_max_nodes": 1000,
759
+ },
760
+ "prefix_with_deployment_name": False,
761
+ "name_suffix": cluster_name,
762
+ "enable_dcgm_monitoring": True,
763
+ "enable_gcsfuse_csi": True,
764
+ "enable_private_endpoint": False,
765
+ "master_authorized_networks": [{
766
+ "cidr_block": auth_cidr,
767
+ "display_name": "kubectl-access-network",
768
+ }],
769
+ "additional_networks": (
770
+ f"$(concat([{{network={cluster_name}-net-1.network_name,"
771
+ f" subnetwork={cluster_name}-net-1.subnetwork_name,"
772
+ f' subnetwork_project="{project_id}", nic_type="GVNIC",'
773
+ " queue_count=null, network_ip=null, stack_type=null,"
774
+ " access_config=[{nat_ip=null, public_ptr_domain_name=null,"
775
+ " network_tier=null}], ipv6_access_config=[],"
776
+ " alias_ip_range=[]}],"
777
+ f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
778
+ ),
779
+ "version_prefix": "1.32.",
780
+ "release_channel": "RAPID",
781
+ "maintenance_exclusions": [{
782
+ "name": "no-minor-or-node-upgrades-indefinite",
783
+ "start_time": "2024-12-01T00:00:00Z",
784
+ "end_time": "2025-12-22T00:00:00Z",
785
+ "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
786
+ }],
787
+ },
788
+ outputs=["instructions"],
789
+ )
790
+ system, _ = get_system_characteristics_by_device_type(a4_device_type)
791
+ if system is None:
792
+ xpk_print(
793
+ "Error: Could not retrieve system characteristics for"
794
+ f" {a4_device_type} device_type."
795
+ )
796
+ xpk_exit(1)
797
+ gpu_pool = DeploymentModule(
798
+ id=f"{cluster_name}-a4-pool",
799
+ source="modules/compute/gke-node-pool",
800
+ use=[cluster_id],
801
+ settings={
802
+ "machine_type": system.gce_machine_type,
803
+ "auto_upgrade": True,
804
+ "zones": [zone],
805
+ "disk_type": "hyperdisk-balanced",
806
+ "static_node_count": num_nodes,
807
+ "local_ssd_count_ephemeral_storage": 32,
808
+ "spot": capacity_type == CapacityType.SPOT,
809
+ "reservation_affinity": self._getblock_reservation_affinity(
810
+ reservation
811
+ ),
812
+ "max_pods_per_node": 32,
813
+ "guest_accelerator": [{
814
+ "type": system.gke_accelerator,
815
+ "count": 8,
816
+ "gpu_driver_installation_config": {
817
+ "gpu_driver_version": "LATEST"
818
+ },
819
+ }],
820
+ "additional_networks": (
821
+ f"$(concat([{{network={cluster_name}-net-1.network_name,"
822
+ f" subnetwork={cluster_name}-net-1.subnetwork_name,"
823
+ f' subnetwork_project="{project_id}", nic_type="GVNIC",'
824
+ " queue_count=null, network_ip=null, stack_type=null,"
825
+ " access_config=[{nat_ip=null, public_ptr_domain_name=null,"
826
+ " network_tier=null}], ipv6_access_config=[],"
827
+ " alias_ip_range=[]}],"
828
+ f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
829
+ ),
830
+ },
831
+ outputs=["instructions"],
832
+ )
833
+
834
+ num_chips = num_nodes * system.chips_per_vm
835
+ workload_manager_install_id = "workload-manager-install"
836
+ workload_manager_install = DeploymentModule(
837
+ id=workload_manager_install_id,
838
+ source="modules/management/kubectl-apply",
839
+ use=[cluster_id],
840
+ settings={
841
+ "kueue": {
842
+ "install": True,
843
+ "version": "v0.10.0", # TAS feature-gates is enabled in CT
844
+ "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
845
+ "config_template_vars": {"num_chips": num_chips},
846
+ },
847
+ "jobset": {"install": True, "version": "v0.7.2"},
848
+ "apply_manifests": [
849
+ {"source": nccl_installer_path},
850
+ {
851
+ "source": (
852
+ f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
853
+ )
854
+ },
855
+ ],
856
+ },
857
+ )
858
+
859
+ workload_configmap = DeploymentModule(
860
+ id="workload_configmap",
861
+ source="modules/management/kubectl-apply",
862
+ use=[cluster_id],
863
+ settings={
864
+ "apply_manifests": [{
865
+ "source": (
866
+ f'$(ghpc_stage("{blueprint_name}"))/config-map.yaml.tftpl'
867
+ ),
868
+ "template_vars": {
869
+ "resource_config_name": (
870
+ f"{cluster_name}-resources-configmap"
871
+ ),
872
+ "num_nodes": f"{num_nodes}",
873
+ "cluster_config_name": f"{cluster_name}-metadata-configmap",
874
+ "capacity_type": f"{capacity_type.value}",
875
+ "reservation": f"{reservation}",
876
+ },
877
+ }]
878
+ },
879
+ )
880
+
881
+ primary_group = DeploymentGroup(
882
+ group="primary",
883
+ modules=[
884
+ gpu_net_0,
885
+ gpu_net_1,
886
+ rma_net,
887
+ a4_cluster,
888
+ gpu_pool,
889
+ workload_manager_install,
890
+ workload_configmap,
891
+ ],
892
+ )
893
+
894
+ a4_blueprint = Blueprint(
895
+ terraform_backend_defaults=self._getblock_terraform_backend(
896
+ gcs_bucket, cluster_name, prefix
897
+ ),
898
+ blueprint_name=blueprint_name,
899
+ toolkit_modules_url=cluster_toolkit_url,
900
+ toolkit_modules_version=cluster_toolkit_version,
901
+ deployment_groups=[primary_group],
902
+ vars={
903
+ "project_id": project_id,
904
+ "deployment_name": blueprint_name,
905
+ "region": region,
906
+ "zone": zone,
907
+ },
908
+ )
909
+
910
+ blueprint_file_path = self._save_blueprint_to_file(
911
+ blueprint_name, a4_blueprint, prefix
912
+ )
913
+ blueprint_dependencies = self._get_blueprint_dependencies(
914
+ a4_device_type, blueprint_name, prefix
619
915
  )
620
916
  return BlueprintGeneratorOutput(
621
917
  blueprint_file=blueprint_file_path,
@@ -638,7 +934,7 @@ class BlueprintGenerator:
638
934
  )
639
935
 
640
936
  def _getblock_terraform_backend(
641
- self, gcs_bucket: str, prefix: str = ""
937
+ self, gcs_bucket: str, cluster_name: str, prefix: str = ""
642
938
  ) -> dict | None:
643
939
  if gcs_bucket is None:
644
940
  return None
@@ -646,12 +942,19 @@ class BlueprintGenerator:
646
942
  "type": "gcs",
647
943
  "configuration": {
648
944
  "bucket": gcs_bucket,
649
- "prefix": self._get_terraforrm_backend_full_prefix(prefix),
945
+ "prefix": self._get_terraforrm_backend_full_prefix(
946
+ cluster_name, prefix
947
+ ),
650
948
  },
651
949
  }
652
950
 
653
- def _get_terraforrm_backend_full_prefix(self, prefix: str = "") -> str:
654
- return f"xpk_terraform_state/{prefix}/tfstate/"
951
+ def _get_terraforrm_backend_full_prefix(
952
+ self, cluster_name: str, prefix: str = ""
953
+ ) -> str:
954
+ full_prefix = "xpk_terraform_state"
955
+ if prefix:
956
+ full_prefix += f"/{prefix}"
957
+ return f"{full_prefix}/{cluster_name}/"
655
958
 
656
959
  def _save_blueprint_to_file(
657
960
  self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
@@ -676,27 +979,14 @@ class BlueprintGenerator:
676
979
  blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
677
980
  return os.path.exists(blueprint_path)
678
981
 
679
- def _get_a3_mega_blueprint_dependencies(
680
- self, blueprint_name: str, prefix: str = ""
681
- ) -> str:
682
- deployment_files_path = os.path.join(
683
- self._get_storage_path(prefix), blueprint_name
684
- )
685
- shutil.copytree(
686
- blueprint_dependencies_dir[a3mega_device_type],
687
- deployment_files_path,
688
- dirs_exist_ok=True,
689
- )
690
- return deployment_files_path
691
-
692
- def _get_a3_ultra_blueprint_dependencies(
693
- self, blueprint_name: str, prefix: str = ""
982
+ def _get_blueprint_dependencies(
983
+ self, device_type: str, blueprint_name: str, prefix: str = ""
694
984
  ) -> str:
695
985
  deployment_files_path = os.path.join(
696
986
  self._get_storage_path(prefix), blueprint_name
697
987
  )
698
988
  shutil.copytree(
699
- blueprint_dependencies_dir[a3ultra_device_type],
989
+ blueprint_dependencies_dir[device_type],
700
990
  deployment_files_path,
701
991
  dirs_exist_ok=True,
702
992
  )
xpk/core/capacity.py CHANGED
@@ -27,6 +27,7 @@ CAPACITY_TYPE_CONFIG_KEY = 'capacity_type'
27
27
  H100_DEVICE_TYPE = 'h100-80gb-8'
28
28
  H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
29
29
  H200_DEVICE_TYPE = 'h200-141gb-8'
30
+ B200_DEVICE_TYPE = 'b200-8'
30
31
  RESERVATION_CONFIG_KEY = 'reservation_id'
31
32
 
32
33
 
xpk/core/cluster.py CHANGED
@@ -32,7 +32,8 @@ from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to
32
32
  from .nodepool import upgrade_gke_nodepools_version
33
33
  from .system_characteristics import SystemCharacteristics
34
34
 
35
- JOBSET_VERSION = 'v0.7.2'
35
+ JOBSET_VERSION = 'v0.8.0'
36
+ PATHWAYS_JOB_VERSION = 'v0.1.0'
36
37
  INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
37
38
  INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
38
39
 
@@ -71,6 +72,35 @@ def set_jobset_on_cluster(args) -> int:
71
72
  return return_code
72
73
 
73
74
 
75
def set_pathways_job_on_cluster(args) -> int:
  """Install PathwaysJob on the cluster with a server-side `kubectl apply`.

  The install manifest is taken from the pathways-job GitHub release that
  matches PATHWAYS_JOB_VERSION.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code of the install command; 0 means success.
  """
  install_manifest = (
      'https://github.com/google/pathways-job/releases/download/'
      f'{PATHWAYS_JOB_VERSION}/install.yaml'
  )
  command = f'kubectl apply --server-side -f {install_manifest}'
  task = f'Install PathwaysJob on {args.cluster}'
  return_code = run_command_with_updates_retry(command, task, args)

  if return_code == 0:
    return return_code

  # The install failed: report the code and point the user at the most
  # likely cause before handing the code back to the caller.
  xpk_print(f'{task} returned with ERROR {return_code}.\n')
  xpk_print(
      "This LIKELY means you're missing Kubernetes Permissions, you can"
      ' validate this by checking if the error references permission problems'
      ' such as `requires one of ["container.*"] permission(s)`. Follow our'
      ' readme:'
      ' https://github.com/google/xpk/blob/main/README.md#troubleshooting for'
      ' instructions on how to fix these permissions.'
  )
  return return_code
102
+
103
+
74
104
  def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
75
105
  """Install NCCL plugin on the cluster.
76
106
 
@@ -135,8 +165,48 @@ def update_cluster_with_gcpfilestore_driver_if_necessary(args) -> int:
135
165
  return 0
136
166
 
137
167
 
168
def update_cluster_with_parallelstore_driver_if_necessary(args) -> int:
  """Enable the Parallelstore CSI driver on a GKE cluster, if not enabled already.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the cluster update succeeded,
    the non-zero return code of the cluster update otherwise.
  """
  if is_driver_enabled_on_cluster(args, driver='parallelstoreCsiDriver'):
    return 0

  return_code = update_gke_cluster_with_addon(args, 'ParallelstoreCsiDriver')
  # Treat every non-zero code as a failure, like the other return-code checks
  # in this module; a `> 0` test would report a negative code as success.
  if return_code != 0:
    xpk_print('Updating GKE cluster to enable Parallelstore CSI driver failed!')
  return return_code
185
+
186
+
187
def update_cluster_with_pd_driver_if_necessary(args) -> int:
  """Enable the PersistentDisk CSI driver on a GKE cluster, if not enabled already.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the driver was already enabled or the cluster update succeeded,
    the non-zero return code of the cluster update otherwise.
  """
  if is_driver_enabled_on_cluster(args, driver='gcePersistentDiskCsiDriver'):
    return 0

  return_code = update_gke_cluster_with_addon(
      args, 'GcePersistentDiskCsiDriver'
  )
  # Treat every non-zero code as a failure, like the other return-code checks
  # in this module; a `> 0` test would report a negative code as success.
  if return_code != 0:
    xpk_print(
        'Updating GKE cluster to enable PersistentDisk CSI driver failed!'
    )
  return return_code
206
+
207
+
138
208
  def is_driver_enabled_on_cluster(args, driver: str) -> bool:
139
- """Checks if GCSFuse CSI driver is enabled on the cluster.
209
+ """Checks if the CSI driver is enabled on the cluster.
140
210
  Args:
141
211
  args: user provided arguments for running the command.
142
212
  driver (str) : name of the driver
@@ -148,14 +218,14 @@ def is_driver_enabled_on_cluster(args, driver: str) -> bool:
148
218
  f' --project={args.project} --region={zone_to_region(args.zone)}'
149
219
  f' --format="value(addonsConfig.{driver}Config.enabled)"'
150
220
  )
151
- return_code, gcsfuse_driver_enabled = run_command_for_value(
221
+ return_code, driver_enabled = run_command_for_value(
152
222
  command,
153
223
  f'Checks if {driver} driver is enabled in cluster describe.',
154
224
  args,
155
225
  )
156
226
  if return_code != 0:
157
227
  xpk_exit(return_code)
158
- if gcsfuse_driver_enabled.lower() == 'true':
228
+ if driver_enabled.strip().lower() == 'true':
159
229
  xpk_print(f'{driver} driver is enabled on the cluster, no update needed.')
160
230
  return True
161
231
  return False
@@ -446,7 +516,7 @@ def is_gcsfuse_driver_enabled_on_cluster(args) -> bool:
446
516
  )
447
517
  if return_code != 0:
448
518
  xpk_exit(return_code)
449
- if gcsfuse_driver_enabled.lower() == 'true':
519
+ if gcsfuse_driver_enabled.strip().lower() == 'true':
450
520
  xpk_print('GCSFuse CSI driver is enabled on the cluster, no update needed.')
451
521
  return True
452
522
  return False
xpk/core/config.py CHANGED
@@ -24,7 +24,7 @@ from ..utils.console import xpk_print
24
24
  from .system_characteristics import AcceleratorType, SystemCharacteristics
25
25
 
26
26
  # This is the version for XPK PyPI package
27
- __version__ = 'v0.7.1'
27
+ __version__ = 'v0.8.0'
28
28
  XPK_CURRENT_VERSION = __version__
29
29
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
30
30
 
@@ -39,6 +39,7 @@ KJOB_SHELL_IMAGE = 'shell-image'
39
39
  KJOB_SHELL_INTERACTIVE_COMMAND = 'shell-interactive-command'
40
40
  KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
41
41
  CONFIGS_KEY = 'configs'
42
+ GKE_ENDPOINT_KEY = 'gke-endpoint'
42
43
  DEPENDENCIES_KEY = 'deps-verified-version'
43
44
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
44
45
 
@@ -47,6 +48,7 @@ DEFAULT_KEYS = [
47
48
  CLUSTER_NAME_KEY,
48
49
  PROJECT_KEY,
49
50
  ZONE_KEY,
51
+ GKE_ENDPOINT_KEY,
50
52
  DEPENDENCIES_KEY,
51
53
  KJOB_BATCH_IMAGE,
52
54
  KJOB_BATCH_WORKING_DIRECTORY,
@@ -30,7 +30,7 @@ import time
30
30
  DockerRunCommandExitCode = 135
31
31
  dockerBuildErrorCode = 134
32
32
  ctk_dockerfile_path = "Dockerfile"
33
- ctk_build_ref = "v1.45.1"
33
+ ctk_build_ref = "v1.48.0"
34
34
  ctk_docker_image = "xpk-ctk"
35
35
  ctk_container_name = "xpk-ctk-container"
36
36
  gcloud_cfg_mount_path = "/root/.config/gcloud"