xpk 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +5 -6
- xpk/commands/cluster.py +246 -73
- xpk/commands/cluster_gcluster.py +27 -0
- xpk/commands/common.py +40 -1
- xpk/commands/kjob_common.py +13 -1
- xpk/commands/run.py +4 -5
- xpk/commands/shell.py +2 -2
- xpk/commands/storage.py +24 -6
- xpk/commands/workload.py +66 -27
- xpk/core/blueprint/blueprint_generator.py +115 -47
- xpk/core/capacity.py +66 -6
- xpk/core/cluster.py +282 -13
- xpk/core/config.py +1 -65
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +145 -72
- xpk/core/filestore.py +2 -6
- xpk/core/gcsfuse.py +22 -4
- xpk/core/jobset.py +143 -0
- xpk/core/kjob.py +21 -18
- xpk/core/kueue.py +194 -4
- xpk/core/mtc.py +195 -0
- xpk/core/network.py +23 -1
- xpk/core/nodepool.py +17 -4
- xpk/core/pathways.py +2 -3
- xpk/core/resources.py +21 -0
- xpk/core/storage.py +1 -95
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +1 -45
- xpk/core/workload_decorators/rdma_decorator.py +8 -10
- xpk/core/workload_decorators/tcpx_decorator.py +185 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
- xpk/parser/cluster.py +589 -389
- xpk/parser/storage.py +12 -3
- xpk/parser/workload.py +21 -3
- xpk/utils/kubectl.py +4 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/capacity.py
CHANGED
|
@@ -16,8 +16,8 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
import enum
|
|
18
18
|
|
|
19
|
-
from ..utils.console import xpk_print
|
|
20
|
-
from .commands import run_command_with_updates
|
|
19
|
+
from ..utils.console import xpk_print, xpk_exit
|
|
20
|
+
from .commands import run_command_with_updates, run_command_for_value
|
|
21
21
|
|
|
22
22
|
AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
|
|
23
23
|
AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
|
|
@@ -36,6 +36,7 @@ class CapacityType(enum.Enum):
|
|
|
36
36
|
RESERVATION = 'reservation'
|
|
37
37
|
SPOT = 'spot'
|
|
38
38
|
UNKNOWN = 'unknown'
|
|
39
|
+
FLEX_START = 'flex_start'
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
def print_reservations(args) -> int:
|
|
@@ -84,6 +85,9 @@ def get_capacity_type(args) -> tuple[CapacityType, int]:
|
|
|
84
85
|
if args.spot:
|
|
85
86
|
capacity_type = CapacityType.SPOT
|
|
86
87
|
num_types += 1
|
|
88
|
+
if args.flex:
|
|
89
|
+
capacity_type = CapacityType.FLEX_START
|
|
90
|
+
num_types += 1
|
|
87
91
|
|
|
88
92
|
# Check that the number of user arguments provided is valid.
|
|
89
93
|
if num_types == 0:
|
|
@@ -91,14 +95,62 @@ def get_capacity_type(args) -> tuple[CapacityType, int]:
|
|
|
91
95
|
elif num_types != 1:
|
|
92
96
|
xpk_print(
|
|
93
97
|
'ERROR: User specified more than one of the following arguments. Please'
|
|
94
|
-
' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
|
|
95
|
-
' or `--spot`.'
|
|
98
|
+
' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`,'
|
|
99
|
+
' `--flex` or `--spot`.'
|
|
96
100
|
)
|
|
97
101
|
return_code = 1
|
|
98
102
|
|
|
99
103
|
return capacity_type, return_code
|
|
100
104
|
|
|
101
105
|
|
|
106
|
+
def get_reservation_maintenance_interval(
|
|
107
|
+
reservation: str, zone: str, project: str
|
|
108
|
+
) -> str:
|
|
109
|
+
"""Get reservation maintenance interval.
|
|
110
|
+
|
|
111
|
+
Args:
|
|
112
|
+
args: user provided arguments for running the command.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
0 if successful and 1 otherwise.
|
|
116
|
+
"""
|
|
117
|
+
command = (
|
|
118
|
+
f'gcloud beta compute reservations describe {reservation}'
|
|
119
|
+
f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
|
|
120
|
+
)
|
|
121
|
+
return_code, output = run_command_for_value(
|
|
122
|
+
command, 'Get reservation maintenance interval', None
|
|
123
|
+
)
|
|
124
|
+
if return_code != 0:
|
|
125
|
+
xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
|
|
126
|
+
xpk_exit(1)
|
|
127
|
+
return output.strip()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_reservation_placement_policy(
|
|
131
|
+
reservation: str, zone: str, project: str
|
|
132
|
+
) -> str:
|
|
133
|
+
"""Get reservation placement policy.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
args: user provided arguments for running the command.
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
0 if successful and 1 otherwise.
|
|
140
|
+
"""
|
|
141
|
+
command = (
|
|
142
|
+
f'gcloud beta compute reservations describe {reservation}'
|
|
143
|
+
f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
|
|
144
|
+
)
|
|
145
|
+
return_code, output = run_command_for_value(
|
|
146
|
+
command, 'Get reservation placement policy', None
|
|
147
|
+
)
|
|
148
|
+
if return_code != 0:
|
|
149
|
+
xpk_print(f'Get reservation placement policy ERROR {return_code}')
|
|
150
|
+
xpk_exit(1)
|
|
151
|
+
return output.strip()
|
|
152
|
+
|
|
153
|
+
|
|
102
154
|
def verify_reservation_exists(args) -> int:
|
|
103
155
|
"""Verify the reservation exists.
|
|
104
156
|
|
|
@@ -121,9 +173,9 @@ def verify_reservation_exists(args) -> int:
|
|
|
121
173
|
|
|
122
174
|
|
|
123
175
|
def get_capacity_arguments_from_capacity_type(
|
|
124
|
-
args, capacity_type: CapacityType
|
|
176
|
+
args, capacity_type: CapacityType, max_nodes: int
|
|
125
177
|
) -> tuple[str, int]:
|
|
126
|
-
"""Determine the TPU Nodepool creation capacity arguments needed.
|
|
178
|
+
"""Determine the Nodepool creation capacity arguments needed.
|
|
127
179
|
|
|
128
180
|
Args:
|
|
129
181
|
args: user provided arguments for running the command.
|
|
@@ -141,6 +193,12 @@ def get_capacity_arguments_from_capacity_type(
|
|
|
141
193
|
capacity_args = ''
|
|
142
194
|
case CapacityType.SPOT:
|
|
143
195
|
capacity_args = '--spot'
|
|
196
|
+
case CapacityType.FLEX_START:
|
|
197
|
+
capacity_args = (
|
|
198
|
+
' --flex-start --enable-queued-provisioning --enable-autoscaling'
|
|
199
|
+
' --location-policy=ANY --reservation-affinity=none'
|
|
200
|
+
f' --no-enable-autorepair --max-nodes={max_nodes}'
|
|
201
|
+
)
|
|
144
202
|
case CapacityType.RESERVATION:
|
|
145
203
|
capacity_args = (
|
|
146
204
|
f'--reservation-affinity=specific --reservation={args.reservation}'
|
|
@@ -173,6 +231,8 @@ def get_capacity_node_selectors_from_capacity_type(
|
|
|
173
231
|
match capacity_type:
|
|
174
232
|
case CapacityType.ON_DEMAND.name:
|
|
175
233
|
node_selector = ''
|
|
234
|
+
case CapacityType.FLEX_START.name:
|
|
235
|
+
node_selector = 'cloud.google.com/gke-queued="true"'
|
|
176
236
|
case CapacityType.SPOT.name:
|
|
177
237
|
node_selector = 'cloud.google.com/gke-spot="true"'
|
|
178
238
|
case CapacityType.RESERVATION.name:
|
xpk/core/cluster.py
CHANGED
|
@@ -14,28 +14,37 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
import yaml
|
|
17
18
|
from google.api_core.exceptions import PermissionDenied
|
|
18
19
|
from google.cloud import resourcemanager_v3
|
|
19
20
|
from kubernetes import client as k8s_client
|
|
20
21
|
from kubernetes import config
|
|
21
22
|
from kubernetes.client.exceptions import ApiException
|
|
22
|
-
from .resources import get_cluster_system_characteristics
|
|
23
23
|
|
|
24
24
|
from ..utils.console import xpk_exit, xpk_print
|
|
25
|
-
from .capacity import H100_DEVICE_TYPE
|
|
25
|
+
from .capacity import B200_DEVICE_TYPE, H100_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
26
26
|
from .commands import (
|
|
27
27
|
run_command_for_value,
|
|
28
28
|
run_command_with_updates,
|
|
29
29
|
run_command_with_updates_retry,
|
|
30
30
|
)
|
|
31
|
-
from .gcloud_context import
|
|
31
|
+
from .gcloud_context import (
|
|
32
|
+
add_zone_and_project,
|
|
33
|
+
get_gke_server_config,
|
|
34
|
+
zone_to_region,
|
|
35
|
+
)
|
|
32
36
|
from .nodepool import upgrade_gke_nodepools_version
|
|
37
|
+
from .resources import get_cluster_system_characteristics
|
|
33
38
|
from .system_characteristics import SystemCharacteristics
|
|
34
39
|
|
|
35
40
|
JOBSET_VERSION = 'v0.8.0'
|
|
36
|
-
PATHWAYS_JOB_VERSION = 'v0.1.
|
|
37
|
-
|
|
38
|
-
|
|
41
|
+
PATHWAYS_JOB_VERSION = 'v0.1.2'
|
|
42
|
+
INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
|
|
43
|
+
INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
|
|
44
|
+
INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
|
|
45
|
+
CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
|
|
46
|
+
NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
|
|
47
|
+
MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
|
|
39
48
|
|
|
40
49
|
DEFAULT_NAMESPACE = 'default'
|
|
41
50
|
XPK_SA = 'xpk-sa'
|
|
@@ -112,9 +121,11 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
112
121
|
0 if successful and 1 otherwise.
|
|
113
122
|
"""
|
|
114
123
|
if system.device_type == H100_DEVICE_TYPE:
|
|
115
|
-
command = f'kubectl apply -f {
|
|
124
|
+
command = f'kubectl apply -f {INSTALLER_NCCL_TCPX}'
|
|
125
|
+
elif system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
|
|
126
|
+
command = f'kubectl apply -f {INSTALLER_NCCL_RDMA}'
|
|
116
127
|
else:
|
|
117
|
-
command = f'kubectl apply -f {
|
|
128
|
+
command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
|
|
118
129
|
|
|
119
130
|
return_code = run_command_with_updates(
|
|
120
131
|
command, 'Install NCCL Plugin On Cluster', args
|
|
@@ -126,9 +137,108 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
|
126
137
|
)
|
|
127
138
|
return 1
|
|
128
139
|
|
|
140
|
+
if system.device_type == H100_DEVICE_TYPE:
|
|
141
|
+
command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
|
|
142
|
+
|
|
143
|
+
return_code = run_command_with_updates(
|
|
144
|
+
command, 'Install NCCL Config On Cluster', args
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
if return_code != 0:
|
|
148
|
+
xpk_print(
|
|
149
|
+
f'Install NCCL Config On Cluster request returned ERROR {return_code}'
|
|
150
|
+
)
|
|
151
|
+
return 1
|
|
152
|
+
|
|
153
|
+
return 0
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def disable_mglru_on_cluster(args) -> int:
|
|
157
|
+
"""Disable MGLRU on the cluster.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
args: user provided arguments for running the command.
|
|
161
|
+
|
|
162
|
+
Returns:
|
|
163
|
+
0 if successful and 1 otherwise.
|
|
164
|
+
"""
|
|
165
|
+
command = f'kubectl apply -f {MGLRU_DISABLE}'
|
|
166
|
+
return_code = run_command_with_updates(
|
|
167
|
+
command, 'Disable MGLRU On Cluster', args
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
if return_code != 0:
|
|
171
|
+
xpk_print('Disablig MGLRU On Cluster request returned ERROR')
|
|
172
|
+
return 1
|
|
173
|
+
|
|
174
|
+
return 0
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def install_nri_on_cluster(args) -> int:
|
|
178
|
+
"""Install NRI Device Injector on the cluster.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
args: user provided arguments for running the command.
|
|
182
|
+
system: system characteristics.
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
0 if successful and 1 otherwise.
|
|
186
|
+
"""
|
|
187
|
+
command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
|
|
188
|
+
return_code = run_command_with_updates(
|
|
189
|
+
command, 'Install NRI Device Injector On Cluster', args
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
if return_code != 0:
|
|
193
|
+
xpk_print(
|
|
194
|
+
'Install NRI Device Injector On Cluster request returned ERROR'
|
|
195
|
+
f' {return_code}'
|
|
196
|
+
)
|
|
197
|
+
return 1
|
|
198
|
+
|
|
129
199
|
return 0
|
|
130
200
|
|
|
131
201
|
|
|
202
|
+
def get_cluster_nodes_info(args) -> list[dict]:
|
|
203
|
+
"""Get list of cluster's nodes descrition in yaml format
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
args: user provided arguments for running the command.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
List of nodes info yaml objects.
|
|
210
|
+
"""
|
|
211
|
+
xpk_print("Getting cluster's info...")
|
|
212
|
+
command = 'kubectl get nodes -o yaml'
|
|
213
|
+
err_code, val = run_command_for_value(
|
|
214
|
+
command=command,
|
|
215
|
+
task='Get cluster nodes info',
|
|
216
|
+
global_args=args,
|
|
217
|
+
)
|
|
218
|
+
if err_code != 0:
|
|
219
|
+
xpk_exit(err_code)
|
|
220
|
+
data = yaml.safe_load(val)
|
|
221
|
+
return data['items'] # pytype: disable=bad-return-type
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
|
|
225
|
+
"""Count cluster nodes by accelerator type"""
|
|
226
|
+
nodes_info = get_cluster_nodes_info(args)
|
|
227
|
+
accelerators = [
|
|
228
|
+
node['metadata']['labels']['cloud.google.com/gke-accelerator']
|
|
229
|
+
for node in nodes_info
|
|
230
|
+
if 'cloud.google.com/gke-accelerator' in node['metadata']['labels']
|
|
231
|
+
]
|
|
232
|
+
if system.device_type != H200_DEVICE_TYPE:
|
|
233
|
+
xpk_print(
|
|
234
|
+
'Automatic node detection is not supported for device type:'
|
|
235
|
+
f' {system.device_type}'
|
|
236
|
+
)
|
|
237
|
+
xpk_exit(1)
|
|
238
|
+
num_nodes: int = sum(acc == system.gke_accelerator for acc in accelerators)
|
|
239
|
+
return num_nodes
|
|
240
|
+
|
|
241
|
+
|
|
132
242
|
def get_cluster_network(args) -> str:
|
|
133
243
|
xpk_print("Getting cluster's VPC network...")
|
|
134
244
|
cluster_network_cmd = (
|
|
@@ -205,28 +315,60 @@ def update_cluster_with_pd_driver_if_necessary(args) -> int:
|
|
|
205
315
|
return 0
|
|
206
316
|
|
|
207
317
|
|
|
208
|
-
def is_driver_enabled_on_cluster(args, driver: str) -> bool:
|
|
318
|
+
def update_cluster_with_lustre_driver_if_necessary(args) -> int:
|
|
319
|
+
"""Updates a GKE cluster to enable Lustre CSI driver, if not enabled already.
|
|
320
|
+
Args:
|
|
321
|
+
args: user provided arguments for running the command.
|
|
322
|
+
Returns:
|
|
323
|
+
0 if successful and error code otherwise.
|
|
324
|
+
"""
|
|
325
|
+
if is_driver_enabled_on_cluster(
|
|
326
|
+
args, driver='lustreCsiDriver'
|
|
327
|
+
) and is_driver_enabled_on_cluster(
|
|
328
|
+
args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
|
|
329
|
+
):
|
|
330
|
+
return 0
|
|
331
|
+
cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
|
|
332
|
+
args
|
|
333
|
+
)
|
|
334
|
+
if cluster_update_return_code > 0:
|
|
335
|
+
xpk_print(
|
|
336
|
+
'Updating GKE cluster to enable PersistentDisk CSI driver failed!'
|
|
337
|
+
)
|
|
338
|
+
return cluster_update_return_code
|
|
339
|
+
|
|
340
|
+
return 0
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def is_driver_enabled_on_cluster(
|
|
344
|
+
args, driver: str, config_key: str = 'enabled', config_val: str = 'true'
|
|
345
|
+
) -> bool:
|
|
209
346
|
"""Checks if the CSI driver is enabled on the cluster.
|
|
210
347
|
Args:
|
|
211
348
|
args: user provided arguments for running the command.
|
|
212
349
|
driver (str) : name of the driver
|
|
350
|
+
config (str): the config to look for; by default looks for "enabled" parameter
|
|
351
|
+
config_val (str): the value indicating the enabled; default vale is "true"
|
|
213
352
|
Returns:
|
|
214
353
|
True if driver is enabled on the cluster and False otherwise.
|
|
215
354
|
"""
|
|
216
355
|
command = (
|
|
217
356
|
f'gcloud container clusters describe {args.cluster}'
|
|
218
357
|
f' --project={args.project} --region={zone_to_region(args.zone)}'
|
|
219
|
-
f' --format="value(addonsConfig.{driver}Config.enabled)"'
|
|
358
|
+
f' --format="value(addonsConfig.{driver}Config.{config_key})"'
|
|
220
359
|
)
|
|
221
360
|
return_code, driver_enabled = run_command_for_value(
|
|
222
361
|
command,
|
|
223
|
-
f
|
|
362
|
+
f"Checks if {driver} driver's {config_key} is enabled in cluster"
|
|
363
|
+
' describe.',
|
|
224
364
|
args,
|
|
225
365
|
)
|
|
226
366
|
if return_code != 0:
|
|
227
367
|
xpk_exit(return_code)
|
|
228
|
-
if driver_enabled.strip().lower() == 'true':
|
|
229
|
-
xpk_print(
|
|
368
|
+
if driver_enabled.strip().lower() == config_val.lower():
|
|
369
|
+
xpk_print(
|
|
370
|
+
f"{driver} driver's {config_key} config is {config_val} on the cluster."
|
|
371
|
+
)
|
|
230
372
|
return True
|
|
231
373
|
return False
|
|
232
374
|
|
|
@@ -313,6 +455,19 @@ def get_gpu_type_from_cluster(args) -> str:
|
|
|
313
455
|
return ''
|
|
314
456
|
|
|
315
457
|
|
|
458
|
+
def setup_k8s_service_accounts() -> None:
|
|
459
|
+
"""
|
|
460
|
+
Creates/sets up SAs and the roles for them
|
|
461
|
+
"""
|
|
462
|
+
default_sa = 'default'
|
|
463
|
+
|
|
464
|
+
create_xpk_k8s_service_account()
|
|
465
|
+
|
|
466
|
+
role_name = create_pod_reader_role()
|
|
467
|
+
create_role_binding(default_sa, role_name)
|
|
468
|
+
create_role_binding(XPK_SA, role_name)
|
|
469
|
+
|
|
470
|
+
|
|
316
471
|
def create_xpk_k8s_service_account() -> None:
|
|
317
472
|
k8s_core_client = k8s_client.CoreV1Api()
|
|
318
473
|
sa = k8s_client.V1ServiceAccount(
|
|
@@ -331,6 +486,94 @@ def create_xpk_k8s_service_account() -> None:
|
|
|
331
486
|
)
|
|
332
487
|
|
|
333
488
|
|
|
489
|
+
def create_pod_reader_role() -> str:
|
|
490
|
+
"""
|
|
491
|
+
Creates the 'pod-reader' Role in the default namespace.
|
|
492
|
+
"""
|
|
493
|
+
k8s_rbac_client = k8s_client.RbacAuthorizationV1Api()
|
|
494
|
+
role_name = 'pod-reader'
|
|
495
|
+
|
|
496
|
+
role = k8s_client.V1Role(
|
|
497
|
+
metadata=k8s_client.V1ObjectMeta(
|
|
498
|
+
name=role_name, namespace=DEFAULT_NAMESPACE
|
|
499
|
+
),
|
|
500
|
+
rules=[
|
|
501
|
+
k8s_client.V1PolicyRule(
|
|
502
|
+
api_groups=[''],
|
|
503
|
+
resources=['pods', 'services'],
|
|
504
|
+
verbs=['get', 'list', 'watch'],
|
|
505
|
+
),
|
|
506
|
+
k8s_client.V1PolicyRule(
|
|
507
|
+
api_groups=['batch'],
|
|
508
|
+
resources=['jobs'],
|
|
509
|
+
verbs=['get', 'list', 'watch'],
|
|
510
|
+
),
|
|
511
|
+
],
|
|
512
|
+
)
|
|
513
|
+
|
|
514
|
+
xpk_print(
|
|
515
|
+
f'Attempting to create Role: {role_name} in namespace:'
|
|
516
|
+
f' {DEFAULT_NAMESPACE}'
|
|
517
|
+
)
|
|
518
|
+
try:
|
|
519
|
+
k8s_rbac_client.create_namespaced_role(DEFAULT_NAMESPACE, role, pretty=True)
|
|
520
|
+
xpk_print(f'Successfully created Role: {role_name}')
|
|
521
|
+
return role_name
|
|
522
|
+
except ApiException as e:
|
|
523
|
+
if e.status == 409: # Conflict, meaning it already exists
|
|
524
|
+
xpk_print(f'Role: {role_name} already exists. Skipping its creation.')
|
|
525
|
+
return role_name
|
|
526
|
+
else:
|
|
527
|
+
xpk_print(f'Error creating Role {role_name}: {e}')
|
|
528
|
+
xpk_exit(1)
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def create_role_binding(sa: str, role_name: str) -> None:
|
|
532
|
+
"""
|
|
533
|
+
Creates a RoleBinding to associate the Service Account
|
|
534
|
+
with the Role in the default namespace.
|
|
535
|
+
Assumes the Service Account and the Role already exist.
|
|
536
|
+
"""
|
|
537
|
+
k8s_rbac_client = k8s_client.RbacAuthorizationV1Api()
|
|
538
|
+
role_binding_name = f'{sa}-{role_name}-binding'
|
|
539
|
+
|
|
540
|
+
role_binding = k8s_client.V1RoleBinding(
|
|
541
|
+
metadata=k8s_client.V1ObjectMeta(
|
|
542
|
+
name=role_binding_name, namespace=DEFAULT_NAMESPACE
|
|
543
|
+
),
|
|
544
|
+
subjects=[
|
|
545
|
+
k8s_client.RbacV1Subject(
|
|
546
|
+
kind='ServiceAccount', name=sa, namespace=DEFAULT_NAMESPACE
|
|
547
|
+
)
|
|
548
|
+
],
|
|
549
|
+
role_ref=k8s_client.V1RoleRef(
|
|
550
|
+
kind='Role', name=role_name, api_group='rbac.authorization.k8s.io'
|
|
551
|
+
),
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
xpk_print(
|
|
555
|
+
f'Attempting to create RoleBinding: {role_binding_name} for Service'
|
|
556
|
+
f' Account: {XPK_SA} to Role: {role_name} in namespace:'
|
|
557
|
+
f' {DEFAULT_NAMESPACE}'
|
|
558
|
+
)
|
|
559
|
+
try:
|
|
560
|
+
k8s_rbac_client.create_namespaced_role_binding(
|
|
561
|
+
DEFAULT_NAMESPACE, role_binding, pretty=True
|
|
562
|
+
)
|
|
563
|
+
xpk_print(
|
|
564
|
+
f'Successfully created RoleBinding: {role_binding_name} for {XPK_SA}'
|
|
565
|
+
)
|
|
566
|
+
except ApiException as e:
|
|
567
|
+
if e.status == 409: # Conflict, meaning it already exists
|
|
568
|
+
xpk_print(
|
|
569
|
+
f'RoleBinding: {role_binding_name} already exists. Skipping its'
|
|
570
|
+
' creation.'
|
|
571
|
+
)
|
|
572
|
+
else:
|
|
573
|
+
xpk_print(f'Error creating RoleBinding {role_binding_name}: {e}')
|
|
574
|
+
xpk_exit(1)
|
|
575
|
+
|
|
576
|
+
|
|
334
577
|
def update_gke_cluster_with_clouddns(args) -> int:
|
|
335
578
|
"""Run the GKE cluster update command for existing clusters and enable CloudDNS.
|
|
336
579
|
|
|
@@ -412,6 +655,32 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
|
|
|
412
655
|
return 0
|
|
413
656
|
|
|
414
657
|
|
|
658
|
+
def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
|
|
659
|
+
"""Run the GKE cluster update command for existing cluster and enable Lustre CSI driver.
|
|
660
|
+
Args:
|
|
661
|
+
args: user provided arguments for running the command.
|
|
662
|
+
Returns:
|
|
663
|
+
0 if successful and 1 otherwise.
|
|
664
|
+
"""
|
|
665
|
+
command = (
|
|
666
|
+
'gcloud container clusters update'
|
|
667
|
+
f' {args.cluster} --project={args.project}'
|
|
668
|
+
f' --region={zone_to_region(args.zone)}'
|
|
669
|
+
' --enable-legacy-lustre-port'
|
|
670
|
+
' --quiet'
|
|
671
|
+
)
|
|
672
|
+
xpk_print(
|
|
673
|
+
'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
|
|
674
|
+
)
|
|
675
|
+
return_code = run_command_with_updates(
|
|
676
|
+
command, 'GKE Cluster Update to enable Lustre CSI driver', args
|
|
677
|
+
)
|
|
678
|
+
if return_code != 0:
|
|
679
|
+
xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
|
|
680
|
+
return 1
|
|
681
|
+
return 0
|
|
682
|
+
|
|
683
|
+
|
|
415
684
|
def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
|
|
416
685
|
"""Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
|
|
417
686
|
|
xpk/core/config.py
CHANGED
|
@@ -15,16 +15,14 @@ limitations under the License.
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
import os
|
|
18
|
-
import re
|
|
19
18
|
|
|
20
19
|
import ruamel.yaml
|
|
21
20
|
|
|
22
21
|
from ..utils import file
|
|
23
22
|
from ..utils.console import xpk_print
|
|
24
|
-
from .system_characteristics import AcceleratorType, SystemCharacteristics
|
|
25
23
|
|
|
26
24
|
# This is the version for XPK PyPI package
|
|
27
|
-
__version__ = 'v0.8.0'
|
|
25
|
+
__version__ = 'v0.10.0'
|
|
28
26
|
XPK_CURRENT_VERSION = __version__
|
|
29
27
|
XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
|
|
30
28
|
|
|
@@ -117,65 +115,3 @@ class XpkConfig:
|
|
|
117
115
|
return None
|
|
118
116
|
val: dict[str, str] = config_yaml[CONFIGS_KEY]
|
|
119
117
|
return val
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
|
|
123
|
-
"""Parses the environment configurations to the jobset config.
|
|
124
|
-
|
|
125
|
-
Args:
|
|
126
|
-
args: user provided arguments for running the command.
|
|
127
|
-
tensorboard_config: configuration of Vertex Tensorboard.
|
|
128
|
-
system: system characteristics.
|
|
129
|
-
"""
|
|
130
|
-
env = {}
|
|
131
|
-
|
|
132
|
-
env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
|
|
133
|
-
if args.env_file:
|
|
134
|
-
print('Setting container environment from', args.env_file)
|
|
135
|
-
with open(file=args.env_file, mode='r', encoding='utf-8') as f:
|
|
136
|
-
for match in env_pat.finditer(f.read()):
|
|
137
|
-
variable = match.group(1)
|
|
138
|
-
if match.group(2) is not None:
|
|
139
|
-
env[variable] = match.group(2)
|
|
140
|
-
else:
|
|
141
|
-
assert variable in os.environ, (
|
|
142
|
-
f'Variable {variable} is not set in the current '
|
|
143
|
-
'environment, a value must be specified.'
|
|
144
|
-
)
|
|
145
|
-
env[variable] = os.environ[variable]
|
|
146
|
-
if args.env:
|
|
147
|
-
for var in args.env:
|
|
148
|
-
match = env_pat.match(var)
|
|
149
|
-
assert match and match.group(2) is not None, (
|
|
150
|
-
'Invalid environment variable, format must be '
|
|
151
|
-
f'`--env VARIABLE=value`: {var}'
|
|
152
|
-
)
|
|
153
|
-
variable = match.group(1)
|
|
154
|
-
env[variable] = match.group(2)
|
|
155
|
-
|
|
156
|
-
if not args.use_pathways:
|
|
157
|
-
if args.debug_dump_gcs:
|
|
158
|
-
if 'XLA_FLAGS' in env:
|
|
159
|
-
raise ValueError(
|
|
160
|
-
'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
|
|
161
|
-
'and environment file. Please choose one way to define '
|
|
162
|
-
'XLA_FLAGS.'
|
|
163
|
-
)
|
|
164
|
-
env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
|
|
165
|
-
|
|
166
|
-
if tensorboard_config:
|
|
167
|
-
env['UPLOAD_DATA_TO_TENSORBOARD'] = True
|
|
168
|
-
for key, value in tensorboard_config.items():
|
|
169
|
-
env[key.upper()] = value
|
|
170
|
-
|
|
171
|
-
if system.accelerator_type == AcceleratorType['GPU']:
|
|
172
|
-
# For GPUs, it has two more spaces ahead of name and value respectively
|
|
173
|
-
env_format = '''
|
|
174
|
-
- name: {key}
|
|
175
|
-
value: "{value}"'''
|
|
176
|
-
else:
|
|
177
|
-
env_format = '''
|
|
178
|
-
- name: {key}
|
|
179
|
-
value: "{value}"'''
|
|
180
|
-
|
|
181
|
-
args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
|
xpk/core/docker_manager.py
CHANGED
|
@@ -30,7 +30,7 @@ import time
|
|
|
30
30
|
DockerRunCommandExitCode = 135
|
|
31
31
|
dockerBuildErrorCode = 134
|
|
32
32
|
ctk_dockerfile_path = "Dockerfile"
|
|
33
|
-
ctk_build_ref = "v1.
|
|
33
|
+
ctk_build_ref = "v1.57.1"
|
|
34
34
|
ctk_docker_image = "xpk-ctk"
|
|
35
35
|
ctk_container_name = "xpk-ctk-container"
|
|
36
36
|
gcloud_cfg_mount_path = "/root/.config/gcloud"
|