xpk: xpk-0.8.0-py3-none-any.whl → xpk-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/batch.py +5 -6
  2. xpk/commands/cluster.py +246 -73
  3. xpk/commands/cluster_gcluster.py +27 -0
  4. xpk/commands/common.py +40 -1
  5. xpk/commands/kjob_common.py +13 -1
  6. xpk/commands/run.py +4 -5
  7. xpk/commands/shell.py +2 -2
  8. xpk/commands/storage.py +24 -6
  9. xpk/commands/workload.py +66 -27
  10. xpk/core/blueprint/blueprint_generator.py +115 -47
  11. xpk/core/capacity.py +66 -6
  12. xpk/core/cluster.py +282 -13
  13. xpk/core/config.py +1 -65
  14. xpk/core/docker_manager.py +1 -1
  15. xpk/core/docker_resources.py +145 -72
  16. xpk/core/filestore.py +2 -6
  17. xpk/core/gcsfuse.py +22 -4
  18. xpk/core/jobset.py +143 -0
  19. xpk/core/kjob.py +21 -18
  20. xpk/core/kueue.py +194 -4
  21. xpk/core/mtc.py +195 -0
  22. xpk/core/network.py +23 -1
  23. xpk/core/nodepool.py +17 -4
  24. xpk/core/pathways.py +2 -3
  25. xpk/core/resources.py +21 -0
  26. xpk/core/storage.py +1 -95
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +1 -45
  29. xpk/core/workload_decorators/rdma_decorator.py +8 -10
  30. xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  32. xpk/parser/cluster.py +589 -389
  33. xpk/parser/storage.py +12 -3
  34. xpk/parser/workload.py +21 -3
  35. xpk/utils/kubectl.py +4 -1
  36. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
  37. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
  38. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
  39. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/capacity.py CHANGED
@@ -16,8 +16,8 @@ limitations under the License.
16
16
 
17
17
  import enum
18
18
 
19
- from ..utils.console import xpk_print
20
- from .commands import run_command_with_updates
19
+ from ..utils.console import xpk_print, xpk_exit
20
+ from .commands import run_command_with_updates, run_command_for_value
21
21
 
22
22
  AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
23
23
  AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
@@ -36,6 +36,7 @@ class CapacityType(enum.Enum):
36
36
  RESERVATION = 'reservation'
37
37
  SPOT = 'spot'
38
38
  UNKNOWN = 'unknown'
39
+ FLEX_START = 'flex_start'
39
40
 
40
41
 
41
42
  def print_reservations(args) -> int:
@@ -84,6 +85,9 @@ def get_capacity_type(args) -> tuple[CapacityType, int]:
84
85
  if args.spot:
85
86
  capacity_type = CapacityType.SPOT
86
87
  num_types += 1
88
+ if args.flex:
89
+ capacity_type = CapacityType.FLEX_START
90
+ num_types += 1
87
91
 
88
92
  # Check that the number of user arguments provided is valid.
89
93
  if num_types == 0:
@@ -91,14 +95,62 @@ def get_capacity_type(args) -> tuple[CapacityType, int]:
91
95
  elif num_types != 1:
92
96
  xpk_print(
93
97
  'ERROR: User specified more than one of the following arguments. Please'
94
- ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`'
95
- ' or `--spot`.'
98
+ ' specify only one of `--reservation=$RESERVATION_NAME`, `--on-demand`,'
99
+ ' `--flex` or `--spot`.'
96
100
  )
97
101
  return_code = 1
98
102
 
99
103
  return capacity_type, return_code
100
104
 
101
105
 
106
+ def get_reservation_maintenance_interval(
107
+ reservation: str, zone: str, project: str
108
+ ) -> str:
109
+ """Get reservation maintenance interval.
110
+
111
+ Args:
112
+ args: user provided arguments for running the command.
113
+
114
+ Returns:
115
+ 0 if successful and 1 otherwise.
116
+ """
117
+ command = (
118
+ f'gcloud beta compute reservations describe {reservation}'
119
+ f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
120
+ )
121
+ return_code, output = run_command_for_value(
122
+ command, 'Get reservation maintenance interval', None
123
+ )
124
+ if return_code != 0:
125
+ xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
126
+ xpk_exit(1)
127
+ return output.strip()
128
+
129
+
130
+ def get_reservation_placement_policy(
131
+ reservation: str, zone: str, project: str
132
+ ) -> str:
133
+ """Get reservation placement policy.
134
+
135
+ Args:
136
+ args: user provided arguments for running the command.
137
+
138
+ Returns:
139
+ 0 if successful and 1 otherwise.
140
+ """
141
+ command = (
142
+ f'gcloud beta compute reservations describe {reservation}'
143
+ f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
144
+ )
145
+ return_code, output = run_command_for_value(
146
+ command, 'Get reservation placement policy', None
147
+ )
148
+ if return_code != 0:
149
+ xpk_print(f'Get reservation placement policy ERROR {return_code}')
150
+ xpk_exit(1)
151
+ return output.strip()
152
+
153
+
102
154
  def verify_reservation_exists(args) -> int:
103
155
  """Verify the reservation exists.
104
156
 
@@ -121,9 +173,9 @@ def verify_reservation_exists(args) -> int:
121
173
 
122
174
 
123
175
  def get_capacity_arguments_from_capacity_type(
124
- args, capacity_type: CapacityType
176
+ args, capacity_type: CapacityType, max_nodes: int
125
177
  ) -> tuple[str, int]:
126
- """Determine the TPU Nodepool creation capacity arguments needed.
178
+ """Determine the Nodepool creation capacity arguments needed.
127
179
 
128
180
  Args:
129
181
  args: user provided arguments for running the command.
@@ -141,6 +193,12 @@ def get_capacity_arguments_from_capacity_type(
141
193
  capacity_args = ''
142
194
  case CapacityType.SPOT:
143
195
  capacity_args = '--spot'
196
+ case CapacityType.FLEX_START:
197
+ capacity_args = (
198
+ ' --flex-start --enable-queued-provisioning --enable-autoscaling'
199
+ ' --location-policy=ANY --reservation-affinity=none'
200
+ f' --no-enable-autorepair --max-nodes={max_nodes}'
201
+ )
144
202
  case CapacityType.RESERVATION:
145
203
  capacity_args = (
146
204
  f'--reservation-affinity=specific --reservation={args.reservation}'
@@ -173,6 +231,8 @@ def get_capacity_node_selectors_from_capacity_type(
173
231
  match capacity_type:
174
232
  case CapacityType.ON_DEMAND.name:
175
233
  node_selector = ''
234
+ case CapacityType.FLEX_START.name:
235
+ node_selector = 'cloud.google.com/gke-queued="true"'
176
236
  case CapacityType.SPOT.name:
177
237
  node_selector = 'cloud.google.com/gke-spot="true"'
178
238
  case CapacityType.RESERVATION.name:
xpk/core/cluster.py CHANGED
@@ -14,28 +14,37 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ import yaml
17
18
  from google.api_core.exceptions import PermissionDenied
18
19
  from google.cloud import resourcemanager_v3
19
20
  from kubernetes import client as k8s_client
20
21
  from kubernetes import config
21
22
  from kubernetes.client.exceptions import ApiException
22
- from .resources import get_cluster_system_characteristics
23
23
 
24
24
  from ..utils.console import xpk_exit, xpk_print
25
- from .capacity import H100_DEVICE_TYPE
25
+ from .capacity import B200_DEVICE_TYPE, H100_DEVICE_TYPE, H200_DEVICE_TYPE
26
26
  from .commands import (
27
27
  run_command_for_value,
28
28
  run_command_with_updates,
29
29
  run_command_with_updates_retry,
30
30
  )
31
- from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to_region
31
+ from .gcloud_context import (
32
+ add_zone_and_project,
33
+ get_gke_server_config,
34
+ zone_to_region,
35
+ )
32
36
  from .nodepool import upgrade_gke_nodepools_version
37
+ from .resources import get_cluster_system_characteristics
33
38
  from .system_characteristics import SystemCharacteristics
34
39
 
35
40
  JOBSET_VERSION = 'v0.8.0'
36
- PATHWAYS_JOB_VERSION = 'v0.1.0'
37
- INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
38
- INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
41
+ PATHWAYS_JOB_VERSION = 'v0.1.2'
42
+ INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
43
+ INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
44
+ INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
45
+ CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
46
+ NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
47
+ MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
39
48
 
40
49
  DEFAULT_NAMESPACE = 'default'
41
50
  XPK_SA = 'xpk-sa'
@@ -112,9 +121,11 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
112
121
  0 if successful and 1 otherwise.
113
122
  """
114
123
  if system.device_type == H100_DEVICE_TYPE:
115
- command = f'kubectl apply -f {INSTALLER_NCC_TCPX}'
124
+ command = f'kubectl apply -f {INSTALLER_NCCL_TCPX}'
125
+ elif system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
126
+ command = f'kubectl apply -f {INSTALLER_NCCL_RDMA}'
116
127
  else:
117
- command = f'kubectl apply -f {INSTALLER_NCC_TCPXO}'
128
+ command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
118
129
 
119
130
  return_code = run_command_with_updates(
120
131
  command, 'Install NCCL Plugin On Cluster', args
@@ -126,9 +137,108 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
126
137
  )
127
138
  return 1
128
139
 
140
+ if system.device_type == H100_DEVICE_TYPE:
141
+ command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
142
+
143
+ return_code = run_command_with_updates(
144
+ command, 'Install NCCL Config On Cluster', args
145
+ )
146
+
147
+ if return_code != 0:
148
+ xpk_print(
149
+ f'Install NCCL Config On Cluster request returned ERROR {return_code}'
150
+ )
151
+ return 1
152
+
153
+ return 0
154
+
155
+
156
+ def disable_mglru_on_cluster(args) -> int:
157
+ """Disable MGLRU on the cluster.
158
+
159
+ Args:
160
+ args: user provided arguments for running the command.
161
+
162
+ Returns:
163
+ 0 if successful and 1 otherwise.
164
+ """
165
+ command = f'kubectl apply -f {MGLRU_DISABLE}'
166
+ return_code = run_command_with_updates(
167
+ command, 'Disable MGLRU On Cluster', args
168
+ )
169
+
170
+ if return_code != 0:
171
+ xpk_print('Disablig MGLRU On Cluster request returned ERROR')
172
+ return 1
173
+
174
+ return 0
175
+
176
+
177
+ def install_nri_on_cluster(args) -> int:
178
+ """Install NRI Device Injector on the cluster.
179
+
180
+ Args:
181
+ args: user provided arguments for running the command.
182
+ system: system characteristics.
183
+
184
+ Returns:
185
+ 0 if successful and 1 otherwise.
186
+ """
187
+ command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
188
+ return_code = run_command_with_updates(
189
+ command, 'Install NRI Device Injector On Cluster', args
190
+ )
191
+
192
+ if return_code != 0:
193
+ xpk_print(
194
+ 'Install NRI Device Injector On Cluster request returned ERROR'
195
+ f' {return_code}'
196
+ )
197
+ return 1
198
+
129
199
  return 0
130
200
 
131
201
 
202
+ def get_cluster_nodes_info(args) -> list[dict]:
203
+ """Get list of cluster's nodes descrition in yaml format
204
+
205
+ Args:
206
+ args: user provided arguments for running the command.
207
+
208
+ Returns:
209
+ List of nodes info yaml objects.
210
+ """
211
+ xpk_print("Getting cluster's info...")
212
+ command = 'kubectl get nodes -o yaml'
213
+ err_code, val = run_command_for_value(
214
+ command=command,
215
+ task='Get cluster nodes info',
216
+ global_args=args,
217
+ )
218
+ if err_code != 0:
219
+ xpk_exit(err_code)
220
+ data = yaml.safe_load(val)
221
+ return data['items'] # pytype: disable=bad-return-type
222
+
223
+
224
+ def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
225
+ """Count cluster nodes by accelerator type"""
226
+ nodes_info = get_cluster_nodes_info(args)
227
+ accelerators = [
228
+ node['metadata']['labels']['cloud.google.com/gke-accelerator']
229
+ for node in nodes_info
230
+ if 'cloud.google.com/gke-accelerator' in node['metadata']['labels']
231
+ ]
232
+ if system.device_type != H200_DEVICE_TYPE:
233
+ xpk_print(
234
+ 'Automatic node detection is not supported for device type:'
235
+ f' {system.device_type}'
236
+ )
237
+ xpk_exit(1)
238
+ num_nodes: int = sum(acc == system.gke_accelerator for acc in accelerators)
239
+ return num_nodes
240
+
241
+
132
242
  def get_cluster_network(args) -> str:
133
243
  xpk_print("Getting cluster's VPC network...")
134
244
  cluster_network_cmd = (
@@ -205,28 +315,60 @@ def update_cluster_with_pd_driver_if_necessary(args) -> int:
205
315
  return 0
206
316
 
207
317
 
208
- def is_driver_enabled_on_cluster(args, driver: str) -> bool:
318
+ def update_cluster_with_lustre_driver_if_necessary(args) -> int:
319
+ """Updates a GKE cluster to enable Lustre CSI driver, if not enabled already.
320
+ Args:
321
+ args: user provided arguments for running the command.
322
+ Returns:
323
+ 0 if successful and error code otherwise.
324
+ """
325
+ if is_driver_enabled_on_cluster(
326
+ args, driver='lustreCsiDriver'
327
+ ) and is_driver_enabled_on_cluster(
328
+ args, driver='lustreCsiDriver', config_key='enableLegacyLustrePort'
329
+ ):
330
+ return 0
331
+ cluster_update_return_code = update_gke_cluster_with_lustre_driver_enabled(
332
+ args
333
+ )
334
+ if cluster_update_return_code > 0:
335
+ xpk_print(
336
+ 'Updating GKE cluster to enable PersistentDisk CSI driver failed!'
337
+ )
338
+ return cluster_update_return_code
339
+
340
+ return 0
341
+
342
+
343
+ def is_driver_enabled_on_cluster(
344
+ args, driver: str, config_key: str = 'enabled', config_val: str = 'true'
345
+ ) -> bool:
209
346
  """Checks if the CSI driver is enabled on the cluster.
210
347
  Args:
211
348
  args: user provided arguments for running the command.
212
349
  driver (str) : name of the driver
350
+ config (str): the config to look for; by default looks for "enabled" parameter
351
+ config_val (str): the value indicating the enabled; default vale is "true"
213
352
  Returns:
214
353
  True if driver is enabled on the cluster and False otherwise.
215
354
  """
216
355
  command = (
217
356
  f'gcloud container clusters describe {args.cluster}'
218
357
  f' --project={args.project} --region={zone_to_region(args.zone)}'
219
- f' --format="value(addonsConfig.{driver}Config.enabled)"'
358
+ f' --format="value(addonsConfig.{driver}Config.{config_key})"'
220
359
  )
221
360
  return_code, driver_enabled = run_command_for_value(
222
361
  command,
223
- f'Checks if {driver} driver is enabled in cluster describe.',
362
+ f"Checks if {driver} driver's {config_key} is enabled in cluster"
363
+ ' describe.',
224
364
  args,
225
365
  )
226
366
  if return_code != 0:
227
367
  xpk_exit(return_code)
228
- if driver_enabled.strip().lower() == 'true':
229
- xpk_print(f'{driver} driver is enabled on the cluster, no update needed.')
368
+ if driver_enabled.strip().lower() == config_val.lower():
369
+ xpk_print(
370
+ f"{driver} driver's {config_key} config is {config_val} on the cluster."
371
+ )
230
372
  return True
231
373
  return False
232
374
 
@@ -313,6 +455,19 @@ def get_gpu_type_from_cluster(args) -> str:
313
455
  return ''
314
456
 
315
457
 
458
+ def setup_k8s_service_accounts() -> None:
459
+ """
460
+ Creates/sets up SAs and the roles for them
461
+ """
462
+ default_sa = 'default'
463
+
464
+ create_xpk_k8s_service_account()
465
+
466
+ role_name = create_pod_reader_role()
467
+ create_role_binding(default_sa, role_name)
468
+ create_role_binding(XPK_SA, role_name)
469
+
470
+
316
471
  def create_xpk_k8s_service_account() -> None:
317
472
  k8s_core_client = k8s_client.CoreV1Api()
318
473
  sa = k8s_client.V1ServiceAccount(
@@ -331,6 +486,94 @@ def create_xpk_k8s_service_account() -> None:
331
486
  )
332
487
 
333
488
 
489
+ def create_pod_reader_role() -> str:
490
+ """
491
+ Creates the 'pod-reader' Role in the default namespace.
492
+ """
493
+ k8s_rbac_client = k8s_client.RbacAuthorizationV1Api()
494
+ role_name = 'pod-reader'
495
+
496
+ role = k8s_client.V1Role(
497
+ metadata=k8s_client.V1ObjectMeta(
498
+ name=role_name, namespace=DEFAULT_NAMESPACE
499
+ ),
500
+ rules=[
501
+ k8s_client.V1PolicyRule(
502
+ api_groups=[''],
503
+ resources=['pods', 'services'],
504
+ verbs=['get', 'list', 'watch'],
505
+ ),
506
+ k8s_client.V1PolicyRule(
507
+ api_groups=['batch'],
508
+ resources=['jobs'],
509
+ verbs=['get', 'list', 'watch'],
510
+ ),
511
+ ],
512
+ )
513
+
514
+ xpk_print(
515
+ f'Attempting to create Role: {role_name} in namespace:'
516
+ f' {DEFAULT_NAMESPACE}'
517
+ )
518
+ try:
519
+ k8s_rbac_client.create_namespaced_role(DEFAULT_NAMESPACE, role, pretty=True)
520
+ xpk_print(f'Successfully created Role: {role_name}')
521
+ return role_name
522
+ except ApiException as e:
523
+ if e.status == 409: # Conflict, meaning it already exists
524
+ xpk_print(f'Role: {role_name} already exists. Skipping its creation.')
525
+ return role_name
526
+ else:
527
+ xpk_print(f'Error creating Role {role_name}: {e}')
528
+ xpk_exit(1)
529
+
530
+
531
+ def create_role_binding(sa: str, role_name: str) -> None:
532
+ """
533
+ Creates a RoleBinding to associate the Service Account
534
+ with the Role in the default namespace.
535
+ Assumes the Service Account and the Role already exist.
536
+ """
537
+ k8s_rbac_client = k8s_client.RbacAuthorizationV1Api()
538
+ role_binding_name = f'{sa}-{role_name}-binding'
539
+
540
+ role_binding = k8s_client.V1RoleBinding(
541
+ metadata=k8s_client.V1ObjectMeta(
542
+ name=role_binding_name, namespace=DEFAULT_NAMESPACE
543
+ ),
544
+ subjects=[
545
+ k8s_client.RbacV1Subject(
546
+ kind='ServiceAccount', name=sa, namespace=DEFAULT_NAMESPACE
547
+ )
548
+ ],
549
+ role_ref=k8s_client.V1RoleRef(
550
+ kind='Role', name=role_name, api_group='rbac.authorization.k8s.io'
551
+ ),
552
+ )
553
+
554
+ xpk_print(
555
+ f'Attempting to create RoleBinding: {role_binding_name} for Service'
556
+ f' Account: {XPK_SA} to Role: {role_name} in namespace:'
557
+ f' {DEFAULT_NAMESPACE}'
558
+ )
559
+ try:
560
+ k8s_rbac_client.create_namespaced_role_binding(
561
+ DEFAULT_NAMESPACE, role_binding, pretty=True
562
+ )
563
+ xpk_print(
564
+ f'Successfully created RoleBinding: {role_binding_name} for {XPK_SA}'
565
+ )
566
+ except ApiException as e:
567
+ if e.status == 409: # Conflict, meaning it already exists
568
+ xpk_print(
569
+ f'RoleBinding: {role_binding_name} already exists. Skipping its'
570
+ ' creation.'
571
+ )
572
+ else:
573
+ xpk_print(f'Error creating RoleBinding {role_binding_name}: {e}')
574
+ xpk_exit(1)
575
+
576
+
334
577
  def update_gke_cluster_with_clouddns(args) -> int:
335
578
  """Run the GKE cluster update command for existing clusters and enable CloudDNS.
336
579
 
@@ -412,6 +655,32 @@ def update_gke_cluster_with_gcsfuse_driver_enabled(args) -> int:
412
655
  return 0
413
656
 
414
657
 
658
+ def update_gke_cluster_with_lustre_driver_enabled(args) -> int:
659
+ """Run the GKE cluster update command for existing cluster and enable Lustre CSI driver.
660
+ Args:
661
+ args: user provided arguments for running the command.
662
+ Returns:
663
+ 0 if successful and 1 otherwise.
664
+ """
665
+ command = (
666
+ 'gcloud container clusters update'
667
+ f' {args.cluster} --project={args.project}'
668
+ f' --region={zone_to_region(args.zone)}'
669
+ ' --enable-legacy-lustre-port'
670
+ ' --quiet'
671
+ )
672
+ xpk_print(
673
+ 'Updating GKE cluster to enable Lustre CSI driver, may take a while!'
674
+ )
675
+ return_code = run_command_with_updates(
676
+ command, 'GKE Cluster Update to enable Lustre CSI driver', args
677
+ )
678
+ if return_code != 0:
679
+ xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
680
+ return 1
681
+ return 0
682
+
683
+
415
684
  def upgrade_gke_control_plane_version(args, default_rapid_gke_version) -> int:
416
685
  """Upgrade GKE cluster's control plane version before updating nodepools to use CloudDNS.
417
686
 
xpk/core/config.py CHANGED
@@ -15,16 +15,14 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  import os
18
- import re
19
18
 
20
19
  import ruamel.yaml
21
20
 
22
21
  from ..utils import file
23
22
  from ..utils.console import xpk_print
24
- from .system_characteristics import AcceleratorType, SystemCharacteristics
25
23
 
26
24
  # This is the version for XPK PyPI package
27
- __version__ = 'v0.8.0'
25
+ __version__ = 'v0.10.0'
28
26
  XPK_CURRENT_VERSION = __version__
29
27
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
30
28
 
@@ -117,65 +115,3 @@ class XpkConfig:
117
115
  return None
118
116
  val: dict[str, str] = config_yaml[CONFIGS_KEY]
119
117
  return val
120
-
121
-
122
- def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
123
- """Parses the environment configurations to the jobset config.
124
-
125
- Args:
126
- args: user provided arguments for running the command.
127
- tensorboard_config: configuration of Vertex Tensorboard.
128
- system: system characteristics.
129
- """
130
- env = {}
131
-
132
- env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
133
- if args.env_file:
134
- print('Setting container environment from', args.env_file)
135
- with open(file=args.env_file, mode='r', encoding='utf-8') as f:
136
- for match in env_pat.finditer(f.read()):
137
- variable = match.group(1)
138
- if match.group(2) is not None:
139
- env[variable] = match.group(2)
140
- else:
141
- assert variable in os.environ, (
142
- f'Variable {variable} is not set in the current '
143
- 'environment, a value must be specified.'
144
- )
145
- env[variable] = os.environ[variable]
146
- if args.env:
147
- for var in args.env:
148
- match = env_pat.match(var)
149
- assert match and match.group(2) is not None, (
150
- 'Invalid environment variable, format must be '
151
- f'`--env VARIABLE=value`: {var}'
152
- )
153
- variable = match.group(1)
154
- env[variable] = match.group(2)
155
-
156
- if not args.use_pathways:
157
- if args.debug_dump_gcs:
158
- if 'XLA_FLAGS' in env:
159
- raise ValueError(
160
- 'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
161
- 'and environment file. Please choose one way to define '
162
- 'XLA_FLAGS.'
163
- )
164
- env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
165
-
166
- if tensorboard_config:
167
- env['UPLOAD_DATA_TO_TENSORBOARD'] = True
168
- for key, value in tensorboard_config.items():
169
- env[key.upper()] = value
170
-
171
- if system.accelerator_type == AcceleratorType['GPU']:
172
- # For GPUs, it has two more spaces ahead of name and value respectively
173
- env_format = '''
174
- - name: {key}
175
- value: "{value}"'''
176
- else:
177
- env_format = '''
178
- - name: {key}
179
- value: "{value}"'''
180
-
181
- args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
xpk/core/docker_manager.py CHANGED
@@ -30,7 +30,7 @@ import time
30
30
  DockerRunCommandExitCode = 135
31
31
  dockerBuildErrorCode = 134
32
32
  ctk_dockerfile_path = "Dockerfile"
33
- ctk_build_ref = "v1.48.0"
33
+ ctk_build_ref = "v1.57.1"
34
34
  ctk_docker_image = "xpk-ctk"
35
35
  ctk_container_name = "xpk-ctk-container"
36
36
  gcloud_cfg_mount_path = "/root/.config/gcloud"