xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +125 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.1.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,173 @@
1
+ # Copyright 2024 "Google LLC"
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ !Blueprint
15
+ blueprint_name: xpk-gke-a3-ultra
16
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
17
+ toolkit_modules_version: v1.62.2
18
+
19
+ vars:
20
+ labels: {gke_product_type: xpk}
21
+
22
+
23
+ terraform_backend_defaults:
24
+ type: gcs
25
+ configuration:
26
+ bucket: test-bucket
27
+ prefix: xpk_terraform_state/testdir/gke-a3-ultra/
28
+
29
+ deployment_groups:
30
+ - !DeploymentGroup
31
+ group: primary
32
+ modules:
33
+ - !DeploymentModule
34
+ id: gke-a3-ultra-net-0
35
+ source: modules/network/vpc
36
+ settings:
37
+ network_name: gke-a3-ultra-net-0
38
+ subnetworks:
39
+ - subnet_name: gke-a3-ultra-sub-0
40
+ subnet_region: us-central1
41
+ subnet_ip: 192.168.0.0/18
42
+ secondary_ranges_list:
43
+ - subnetwork_name : gke-a3-ultra-sub-0
44
+ ranges:
45
+ - range_name: pods
46
+ ip_cidr_range: 10.4.0.0/14
47
+ - range_name: services
48
+ ip_cidr_range: 10.0.32.0/20
49
+ firewall_rules:
50
+ - name: gke-a3-ultra-internal-0
51
+ ranges: [192.168.0.0/16]
52
+ allow:
53
+ - protocol: tcp
54
+ ports: ["0-65535"]
55
+ - protocol: udp
56
+ ports: ["0-65535"]
57
+ - protocol: icmp
58
+
59
+ - !DeploymentModule
60
+ id: gke-a3-ultra-net-1
61
+ source: modules/network/vpc
62
+ settings:
63
+ network_name: gke-a3-ultra-net-1
64
+ mtu: 8896
65
+ subnetworks:
66
+ - subnet_name: gke-a3-ultra-sub-1
67
+ subnet_region: us-central1
68
+ subnet_ip: 192.168.64.0/18
69
+ firewall_rules:
70
+ - name: gke-a3-ultra-internal-1
71
+ ranges: [192.168.0.0/16]
72
+ allow:
73
+ - protocol: tcp
74
+ ports: ["0-65535"]
75
+ - protocol: udp
76
+ ports: ["0-65535"]
77
+ - protocol: icmp
78
+
79
+ - !DeploymentModule
80
+ id: gke-a3-ultra-rdma-net
81
+ source: modules/network/gpu-rdma-vpc
82
+ settings:
83
+ network_name: gke-a3-ultra-rdma-net
84
+ mtu: 8896
85
+ network_profile: https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
86
+ network_routing_mode: REGIONAL
87
+ subnetworks_template:
88
+ name_prefix: gke-a3-ultra-rdma-sub
89
+ count: 8
90
+ ip_range: 192.168.128.0/18
91
+ region: us-central1
92
+
93
+ - !DeploymentModule
94
+ id: gke-a3-ultra-a3-ultragpu-cluster
95
+ source: modules/scheduler/gke-cluster
96
+ use: [gke-a3-ultra-net-0]
97
+ settings:
98
+ release_channel: RAPID
99
+ version_prefix: '1.2'
100
+ min_cluster_version: 1.2.3
101
+ prefix_with_deployment_name: false
102
+ name_suffix: gke-a3-ultra
103
+ system_node_pool_machine_type: "e2-standard-16"
104
+ enable_dcgm_monitoring: true
105
+ enable_gcsfuse_csi: true
106
+ enable_filestore_csi: true
107
+ enable_private_endpoint: false # Allows access from authorized public IPs
108
+ master_authorized_networks:
109
+ - cidr_block: 10.0.0.0/32 # Allows your machine to run the kubectl command. Required for multi network setup.
110
+ display_name: "kubectl-access-network"
111
+ system_node_pool_node_count:
112
+ total_min_nodes: 2
113
+ total_max_nodes: 1000
114
+ additional_networks: $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
115
+ k8s_network_names:
116
+ rdma_prefix: "gke-a3-ultra-rdma-sub-"
117
+ rdma_start_index: 0
118
+ rdma_postfix: ""
119
+ gvnic_prefix: "gke-a3-ultra-sub-"
120
+ gvnic_start_index: 1
121
+ outputs: [instructions]
122
+
123
+ - !DeploymentModule
124
+ id: gke-a3-ultra-a3u-pool
125
+ source: modules/compute/gke-node-pool
126
+ use: [gke-a3-ultra-a3-ultragpu-cluster]
127
+ settings:
128
+ machine_type: a3-ultragpu-8g
129
+ auto_upgrade: true
130
+ zones: [us-central1-c]
131
+ spot: false
132
+ max_pods_per_node: 32
133
+ guest_accelerator:
134
+ - type: nvidia-h200-141gb
135
+ count: 8
136
+ gpu_driver_installation_config:
137
+ gpu_driver_version: "LATEST"
138
+ additional_networks:
139
+ $(concat([{network=gke-a3-ultra-net-1.network_name, subnetwork=gke-a3-ultra-net-1.subnetwork_name, subnetwork_project="foo", nic_type="GVNIC", queue_count=null, network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a3-ultra-rdma-net.subnetwork_interfaces_gke))
140
+ reservation_affinity:
141
+ consume_reservation_type: SPECIFIC_RESERVATION
142
+ specific_reservations:
143
+ - name: test-reservation
144
+ static_node_count: 2
145
+ outputs: [instructions]
146
+
147
+ - !DeploymentModule
148
+ id: workload-manager-install
149
+ source: modules/management/kubectl-apply
150
+ use: [gke-a3-ultra-a3-ultragpu-cluster]
151
+ settings:
152
+ jobset:
153
+ install: true
154
+ version: v0.7.2
155
+ apply_manifests:
156
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/nccl-installer.yaml
157
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/mlgru-disable.yaml
158
+ - source: $(ghpc_stage("xpk-gke-a3-ultra"))/storage_crd.yaml
159
+
160
+ - !DeploymentModule
161
+ id: workload_configmap
162
+ source: 'modules/management/kubectl-apply'
163
+ use: ['gke-a3-ultra-a3-ultragpu-cluster']
164
+ settings:
165
+ apply_manifests:
166
+ - source: '$(ghpc_stage("xpk-gke-a3-ultra"))/config-map.yaml.tftpl'
167
+ template_vars: {
168
+ resource_config_name: "gke-a3-ultra-resources-configmap",
169
+ num_nodes: "2",
170
+ cluster_config_name: "gke-a3-ultra-metadata-configmap",
171
+ capacity_type: "reservation",
172
+ reservation: "test-reservation",
173
+ }
@@ -0,0 +1,185 @@
1
+ # Copyright 2024 "Google LLC"
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ !Blueprint
15
+ blueprint_name: xpk-gke-a4
16
+ toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
17
+ toolkit_modules_version: v1.62.2
18
+
19
+ vars:
20
+ labels: {gke_product_type: xpk}
21
+
22
+
23
+ terraform_backend_defaults:
24
+ type: gcs
25
+ configuration:
26
+ bucket: test-bucket
27
+ prefix: xpk_terraform_state/testdir/gke-a4/
28
+
29
+ deployment_groups:
30
+ - !DeploymentGroup
31
+ modules:
32
+ - !DeploymentModule
33
+ id: gke-a4-net-0
34
+ source: modules/network/vpc
35
+ settings:
36
+ network_name: gke-a4-net-0
37
+ mtu: 8896
38
+ subnetworks:
39
+ - subnet_name: gke-a4-sub-0
40
+ subnet_region: us-central1
41
+ subnet_ip: 192.168.0.0/18
42
+ secondary_ranges_list:
43
+ - subnetwork_name: gke-a4-sub-0
44
+ ranges:
45
+ - range_name: pods
46
+ ip_cidr_range: 10.4.0.0/14
47
+ - range_name: services
48
+ ip_cidr_range: 10.0.32.0/20
49
+ firewall_rules:
50
+ - name: gke-a4-internal-0
51
+ ranges:
52
+ - 192.168.0.0/16
53
+ allow:
54
+ - protocol: tcp
55
+ ports:
56
+ - 0-65535
57
+ - protocol: udp
58
+ ports:
59
+ - 0-65535
60
+ - protocol: icmp
61
+
62
+ - !DeploymentModule
63
+ id: gke-a4-net-1
64
+ source: modules/network/vpc
65
+ settings:
66
+ network_name: gke-a4-net-1
67
+ mtu: 8896
68
+ subnetworks:
69
+ - subnet_name: gke-a4-sub-1
70
+ subnet_region: us-central1
71
+ subnet_ip: 192.168.64.0/18
72
+ firewall_rules:
73
+ - name: gke-a4-internal-1
74
+ ranges:
75
+ - 192.168.0.0/16
76
+ allow:
77
+ - protocol: tcp
78
+ ports:
79
+ - 0-65535
80
+ - protocol: udp
81
+ ports:
82
+ - 0-65535
83
+ - protocol: icmp
84
+
85
+ - !DeploymentModule
86
+ id: gke-a4-rdma-net
87
+ source: modules/network/gpu-rdma-vpc
88
+ settings:
89
+ network_name: gke-a4-rdma-net
90
+ mtu: 8896
91
+ network_profile:
92
+ https://www.googleapis.com/compute/beta/projects/foo/global/networkProfiles/us-central1-c-vpc-roce
93
+ network_routing_mode: REGIONAL
94
+ subnetworks_template:
95
+ name_prefix: gke-a4-rdma-sub
96
+ count: 8
97
+ ip_range: 192.168.128.0/18
98
+ region: us-central1
99
+
100
+ - !DeploymentModule
101
+ id: gke-a4-a4-cluster
102
+ source: modules/scheduler/gke-cluster
103
+ outputs:
104
+ - instructions
105
+ settings:
106
+ system_node_pool_machine_type: e2-standard-16
107
+ system_node_pool_node_count:
108
+ total_min_nodes: 2
109
+ total_max_nodes: 1000
110
+ prefix_with_deployment_name: false
111
+ name_suffix: gke-a4
112
+ enable_dcgm_monitoring: true
113
+ enable_gcsfuse_csi: true
114
+ enable_private_endpoint: false
115
+ master_authorized_networks:
116
+ - cidr_block: 10.0.0.0/32
117
+ display_name: kubectl-access-network
118
+ additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
119
+ subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
120
+ network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
121
+ network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
122
+ release_channel: RAPID
123
+ version_prefix: '1.2'
124
+ min_cluster_version: 1.2.3
125
+ use:
126
+ - gke-a4-net-0
127
+ - !DeploymentModule
128
+ id: gke-a4-a4-pool
129
+ source: modules/compute/gke-node-pool
130
+ use:
131
+ - gke-a4-a4-cluster
132
+ outputs:
133
+ - instructions
134
+ settings:
135
+ machine_type: a4-highgpu-8g
136
+ auto_upgrade: true
137
+ zones:
138
+ - us-central1-c
139
+ disk_type: hyperdisk-balanced
140
+ local_ssd_count_ephemeral_storage: 32
141
+ spot: false
142
+ reservation_affinity:
143
+ consume_reservation_type: SPECIFIC_RESERVATION
144
+ specific_reservations:
145
+ - name: test-reservation
146
+ max_pods_per_node: 32
147
+ guest_accelerator:
148
+ - type: nvidia-b200
149
+ count: 8
150
+ gpu_driver_installation_config:
151
+ gpu_driver_version: LATEST
152
+ additional_networks: $(concat([{network=gke-a4-net-1.network_name, subnetwork=gke-a4-net-1.subnetwork_name,
153
+ subnetwork_project="foo", nic_type="GVNIC", queue_count=null,
154
+ network_ip=null, stack_type=null, access_config=[{nat_ip=null, public_ptr_domain_name=null,
155
+ network_tier=null}], ipv6_access_config=[], alias_ip_range=[]}], gke-a4-rdma-net.subnetwork_interfaces_gke))
156
+ static_node_count: 2
157
+
158
+ - !DeploymentModule
159
+ id: workload-manager-install
160
+ source: modules/management/kubectl-apply
161
+ use:
162
+ - gke-a4-a4-cluster
163
+ settings:
164
+ jobset:
165
+ install: true
166
+ version: v0.7.2
167
+ apply_manifests:
168
+ - source: $(ghpc_stage("xpk-gke-a4"))/nccl-rdma-installer-a4.yaml
169
+ - source: $(ghpc_stage("xpk-gke-a4"))/storage_crd.yaml
170
+
171
+ - !DeploymentModule
172
+ id: workload_configmap
173
+ source: modules/management/kubectl-apply
174
+ settings:
175
+ apply_manifests:
176
+ - source: $(ghpc_stage("xpk-gke-a4"))/config-map.yaml.tftpl
177
+ template_vars:
178
+ resource_config_name: gke-a4-resources-configmap
179
+ num_nodes: '2'
180
+ cluster_config_name: gke-a4-metadata-configmap
181
+ capacity_type: reservation
182
+ reservation: test-reservation
183
+ use:
184
+ - gke-a4-a4-cluster
185
+ group: primary
xpk/core/capacity.py CHANGED
@@ -29,6 +29,8 @@ H100_DEVICE_TYPE = 'h100-80gb-8'
29
29
  H100_MEGA_DEVICE_TYPE = 'h100-mega-80gb-8'
30
30
  H200_DEVICE_TYPE = 'h200-141gb-8'
31
31
  B200_DEVICE_TYPE = 'b200-8'
32
+ GB200_DEVICE_TYPE = 'gb200-4'
33
+ GB200_DEVICE_TYPE_NOLSSD = 'gb200-4-no-ssd'
32
34
  RESERVATION_CONFIG_KEY = 'reservation_id'
33
35
 
34
36
 
xpk/core/cluster.py CHANGED
@@ -22,7 +22,7 @@ from kubernetes import config
22
22
  from kubernetes.client.exceptions import ApiException
23
23
 
24
24
  from ..utils.console import xpk_exit, xpk_print
25
- from .capacity import B200_DEVICE_TYPE, H100_DEVICE_TYPE, H200_DEVICE_TYPE
25
+ from .capacity import H200_DEVICE_TYPE
26
26
  from .commands import (
27
27
  run_command_for_value,
28
28
  run_command_with_updates,
@@ -34,16 +34,11 @@ from .gcloud_context import (
34
34
  zone_to_region,
35
35
  )
36
36
  from .resources import get_cluster_system_characteristics
37
- from .system_characteristics import SystemCharacteristics
37
+ from .system_characteristics import INSTALLER_NCCL_TCPXO, SystemCharacteristics
38
38
 
39
39
  JOBSET_VERSION = 'v0.8.0'
40
40
  PATHWAYS_JOB_VERSION = 'v0.1.4'
41
- INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
42
- INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
43
- INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
44
- CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
45
41
  NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
46
- MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
47
42
 
48
43
  DEFAULT_NAMESPACE = 'default'
49
44
  XPK_SA = 'xpk-sa'
@@ -118,12 +113,12 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
118
113
  Returns:
119
114
  0 if successful and 1 otherwise.
120
115
  """
121
- if system.device_type == H100_DEVICE_TYPE:
122
- command = f'kubectl apply -f {INSTALLER_NCCL_TCPX}'
123
- elif system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
124
- command = f'kubectl apply -f {INSTALLER_NCCL_RDMA}'
125
- else:
126
- command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
116
+ nccl_installer = (
117
+ system.gpu_config.nccl_installer
118
+ if system.gpu_config and system.gpu_config.nccl_installer
119
+ else INSTALLER_NCCL_TCPXO
120
+ )
121
+ command = f'kubectl apply -f {nccl_installer}'
127
122
 
128
123
  return_code = run_command_with_updates(
129
124
  command, 'Install NCCL Plugin On Cluster'
@@ -135,35 +130,6 @@ def install_nccl_on_cluster(system: SystemCharacteristics) -> int:
135
130
  )
136
131
  return 1
137
132
 
138
- if system.device_type == H100_DEVICE_TYPE:
139
- command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
140
-
141
- return_code = run_command_with_updates(
142
- command, 'Install NCCL Config On Cluster'
143
- )
144
-
145
- if return_code != 0:
146
- xpk_print(
147
- f'Install NCCL Config On Cluster request returned ERROR {return_code}'
148
- )
149
- return 1
150
-
151
- return 0
152
-
153
-
154
- def disable_mglru_on_cluster() -> int:
155
- """Disable MGLRU on the cluster.
156
-
157
- Returns:
158
- 0 if successful and 1 otherwise.
159
- """
160
- command = f'kubectl apply -f {MGLRU_DISABLE}'
161
- return_code = run_command_with_updates(command, 'Disable MGLRU On Cluster')
162
-
163
- if return_code != 0:
164
- xpk_print('Disablig MGLRU On Cluster request returned ERROR')
165
- return 1
166
-
167
133
  return 0
168
134
 
169
135
 
@@ -309,10 +275,11 @@ def update_cluster_with_lustre_driver_if_necessary(args) -> int:
309
275
  Returns:
310
276
  0 if successful and error code otherwise.
311
277
  """
312
- if is_driver_enabled_on_cluster(
313
- args, driver='lustreCsiDriver'
314
- ) and is_driver_enabled_on_cluster(
315
- args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
278
+ if is_driver_enabled_on_cluster(args, driver='lustreCsiDriver') and (
279
+ not args.enable_legacy_lustre_port
280
+ or is_driver_enabled_on_cluster(
281
+ args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
282
+ )
316
283
  ):
317
284
  return 0
318
285
  cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
@@ -621,9 +588,13 @@ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
621
588
  """
622
589
  command = (
623
590
  'gcloud container clusters update'
624
- f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)} --enable-legacy-lustre-port'
591
+ f' {args.cluster} --project={args.project} --location={get_cluster_location(args.project, args.cluster, args.zone)}'
625
592
  ' --quiet'
626
593
  )
594
+ if args.enable_legacy_lustre_port:
595
+ command += ' --enable-legacy-lustre-port'
596
+ else:
597
+ command += ' --update-addons=LustreCsiDriver=ENABLED'
627
598
  xpk_print(
628
599
  'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
629
600
  )
xpk/core/cluster_test.py CHANGED
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  import pytest
18
18
  from .testing.commands_tester import CommandsTester
19
- from .cluster import get_cluster_credentials
19
+ from .cluster import get_cluster_credentials, update_gke_cluster_with_lustre_driver_enabled, update_cluster_with_lustre_driver_if_necessary
20
20
  from pytest_mock import MockerFixture
21
21
 
22
22
 
@@ -93,3 +93,78 @@ def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
93
93
  if "dns-endpoint" not in c
94
94
  ]
95
95
  assert len(non_dns_endpoint_commands) == 1
96
+
97
+
98
def test_update_cluster_with_lustre_driver_if_necessary_with_default_port_runs_correct_checks(
    commands_tester: CommandsTester, command_args
):
  """Without the legacy port flag, only the driver-enabled check is run."""
  expected_driver_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
  )
  # Every `describe` call reports "True", i.e. the driver is already enabled.
  commands_tester.set_result_for_command(
      (0, "True"), "gcloud container clusters describe"
  )
  command_args.enable_legacy_lustre_port = None

  update_cluster_with_lustre_driver_if_necessary(command_args)

  assert commands_tester.get_matching_commands() == [expected_driver_check]
114
+
115
+
116
def test_update_cluster_with_lustre_driver_if_necessary_with_legacy_port_runs_correct_checks(
    commands_tester: CommandsTester, command_args
):
  """With the legacy port flag, both the driver and legacy-port checks run."""
  expected_driver_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enabled)"'
  )
  expected_legacy_port_check = (
      "gcloud container clusters describe cluster --project=project"
      " --location=us-central1"
      ' --format="value(addonsConfig.lustreCsiDriverConfig.enableLegacyLustrePort)"'
  )
  # Every `describe` call reports "True", so no cluster update is expected.
  commands_tester.set_result_for_command(
      (0, "True"), "gcloud container clusters describe"
  )
  command_args.enable_legacy_lustre_port = True

  update_cluster_with_lustre_driver_if_necessary(command_args)

  assert commands_tester.get_matching_commands() == [
      expected_driver_check,
      expected_legacy_port_check,
  ]
139
+
140
+
141
def test_update_gke_cluster_with_lustre_driver_enabled_default_port(
    commands_tester: CommandsTester, command_args
):
  """Without the legacy port flag, the update enables the Lustre CSI addon."""
  expected_update = (
      "gcloud container clusters update cluster --project=project"
      " --location=us-central1 --quiet --update-addons=LustreCsiDriver=ENABLED"
  )
  commands_tester.set_result_for_command(
      (0, ""), "gcloud container clusters update"
  )
  command_args.enable_legacy_lustre_port = None

  update_gke_cluster_with_lustre_driver_enabled(command_args)

  assert commands_tester.get_matching_commands() == [expected_update]
155
+
156
+
157
def test_update_gke_cluster_with_lustre_driver_enabled_legacy_port(
    commands_tester: CommandsTester, command_args
):
  """With the legacy port flag, the update passes --enable-legacy-lustre-port."""
  expected_update = (
      "gcloud container clusters update cluster --project=project"
      " --location=us-central1 --quiet --enable-legacy-lustre-port"
  )
  commands_tester.set_result_for_command(
      (0, ""), "gcloud container clusters update"
  )
  command_args.enable_legacy_lustre_port = True

  update_gke_cluster_with_lustre_driver_enabled(command_args)

  assert commands_tester.get_matching_commands() == [expected_update]
xpk/core/config.py CHANGED
@@ -17,12 +17,32 @@ limitations under the License.
17
17
  import os
18
18
 
19
19
  import ruamel.yaml
20
-
20
+ from abc import ABC, abstractmethod
21
21
  from ..utils import file
22
22
  from ..utils.console import xpk_print
23
+ from setuptools_scm import get_version as setuptools_get_version
24
+ from importlib.metadata import version, PackageNotFoundError
25
+
26
+
27
+ def _get_version() -> str:
28
+ xpk_version_override = os.getenv('XPK_VERSION_OVERRIDE', '')
29
+ if xpk_version_override != '':
30
+ return xpk_version_override
31
+
32
+ try:
33
+ return setuptools_get_version()
34
+ except LookupError:
35
+ pass
36
+
37
+ try:
38
+ return version('xpk')
39
+ except PackageNotFoundError:
40
+ pass
23
41
 
24
- # This is the version for XPK PyPI package
25
- __version__ = 'v0.15.0'
42
+ raise LookupError('unable to determine version number')
43
+
44
+
45
+ __version__ = _get_version()
26
46
  XPK_CURRENT_VERSION = __version__
27
47
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
28
48
 
@@ -41,7 +61,6 @@ KJOB_SHELL_WORKING_DIRECTORY = 'shell-working-directory'
41
61
  CONFIGS_KEY = 'configs'
42
62
  GKE_ENDPOINT_KEY = 'gke-endpoint'
43
63
  DEPENDENCIES_KEY = 'deps-verified-version'
44
- XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
45
64
 
46
65
  DEFAULT_KEYS = [
47
66
  CFG_BUCKET_KEY,
@@ -64,8 +83,28 @@ VERTEX_TENSORBOARD_FEATURE_FLAG = XPK_CURRENT_VERSION >= '0.4.0'
64
83
  yaml = ruamel.yaml.YAML()
65
84
 
66
85
 
67
- class XpkConfig:
68
- """XpkConfig is a class for setting and getting values from .yaml config file."""
86
+ class Config(ABC):
87
+ """Stores and manipulates XPK configuration."""
88
+
89
+ @abstractmethod
90
+ def set(self, key: str, value: str | None) -> None:
91
+ """Sets the config value"""
92
+ pass
93
+
94
+ @abstractmethod
95
+ def get(self, key: str) -> str | None:
96
+ """Reads the config value"""
97
+ pass
98
+
99
+ @abstractmethod
100
+ def get_all(
101
+ self,
102
+ ) -> dict[str, str] | None:
103
+ pass
104
+
105
+
106
+ class FileSystemConfig(Config):
107
+ """XPK Configuration manipulation class leveraging the file system."""
69
108
 
70
109
  def __init__(self, custom_config_file: str = XPK_CONFIG_FILE) -> None:
71
110
  self._config = custom_config_file
@@ -120,4 +159,39 @@ class XpkConfig:
120
159
  return val
121
160
 
122
161
 
123
- xpk_config = XpkConfig()
162
class InMemoryXpkConfig(Config):
  """`Config` implementation that keeps its values in an in-memory dict."""

  def __init__(self) -> None:
    self._allowed_keys = DEFAULT_KEYS
    self._config: dict[str, str] = {}

  def set(self, key: str, value: str | None) -> None:
    """Stores `value` under `key`; a `None` value removes the key.

    Keys that are not in the allowed key list are silently ignored.
    """
    if key in self._allowed_keys:
      if value is None:
        self._config.pop(key, None)
      else:
        self._config[key] = value

  def get(self, key: str) -> str | None:
    """Returns the stored value, or None if `key` is unset or not allowed."""
    return self._config.get(key) if key in self._allowed_keys else None

  def get_all(self) -> dict[str, str] | None:
    """Returns every stored value, or None when nothing is stored.

    NOTE: the returned dict is the live internal mapping, not a copy.
    """
    return self._config if self._config else None
186
+
187
+
188
+ _xpk_config: Config = InMemoryXpkConfig()
189
+
190
+
191
def set_config(config: Config) -> None:
  """Replaces the module-level config object returned by `get_config`."""
  global _xpk_config
  _xpk_config = config
194
+
195
+
196
def get_config() -> Config:
  """Returns the current module-level config object.

  This is an `InMemoryXpkConfig` until `set_config` installs another one.
  """
  return _xpk_config