xpk 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/batch.py CHANGED
@@ -18,7 +18,7 @@ import re
18
18
  from argparse import Namespace
19
19
 
20
20
  from ..core.cluster import (
21
- create_xpk_k8s_service_account,
21
+ setup_k8s_service_accounts,
22
22
  get_cluster_credentials,
23
23
  )
24
24
  from ..core.commands import run_command_for_value
@@ -54,14 +54,14 @@ def batch(args: Namespace) -> None:
54
54
  err_code = prepare_kjob(args)
55
55
  if err_code > 0:
56
56
  xpk_exit(err_code)
57
- create_xpk_k8s_service_account()
57
+ setup_k8s_service_accounts()
58
58
 
59
59
  submit_job(args)
60
60
 
61
61
 
62
62
  def submit_job(args: Namespace) -> None:
63
63
 
64
- create_xpk_k8s_service_account()
64
+ setup_k8s_service_accounts()
65
65
 
66
66
  cmd = (
67
67
  'kubectl kjob create slurm'
xpk/commands/cluster.py CHANGED
@@ -31,6 +31,7 @@ from ..core.cluster import (
31
31
  update_cluster_with_gcsfuse_driver_if_necessary,
32
32
  update_cluster_with_parallelstore_driver_if_necessary,
33
33
  update_cluster_with_pd_driver_if_necessary,
34
+ update_cluster_with_lustre_driver_if_necessary,
34
35
  update_cluster_with_workload_identity_if_necessary,
35
36
  )
36
37
  from ..core.cluster_private import authorize_private_cluster_access_if_necessary
@@ -42,12 +43,14 @@ from ..core.gcloud_context import (
42
43
  get_gke_server_config,
43
44
  zone_to_region,
44
45
  )
46
+ from ..core.jobset import update_jobset_resources_if_necessary
45
47
  from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
46
48
  from ..core.kueue import (
47
49
  cluster_preheat_yml,
48
50
  install_kueue_crs,
49
51
  install_kueue_on_cluster,
50
52
  wait_for_kueue_available,
53
+ update_kueue_resources_if_necessary,
51
54
  )
52
55
  from ..core.nap import enable_autoprovisioning_on_cluster
53
56
  from ..core.network import (
@@ -170,7 +173,6 @@ def cluster_adapt(args) -> None:
170
173
  install_kueue(args, system, autoprovisioning_config)
171
174
 
172
175
  install_kjob(args)
173
-
174
176
  if system.accelerator_type == AcceleratorType['GPU']:
175
177
  prepare_gpus(args, system)
176
178
 
@@ -308,6 +310,9 @@ def cluster_create(args) -> None:
308
310
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
309
311
  if set_jobset_on_cluster_code != 0:
310
312
  xpk_exit(set_jobset_on_cluster_code)
313
+ update_jobset_resources_code = update_jobset_resources_if_necessary(args)
314
+ if update_jobset_resources_code != 0:
315
+ xpk_exit(update_jobset_resources_code)
311
316
 
312
317
  set_pathways_job_on_cluster_code = set_pathways_job_on_cluster(args)
313
318
  if set_pathways_job_on_cluster_code != 0:
@@ -879,6 +884,10 @@ def run_gke_cluster_create_command(
879
884
  if args.enable_pd_csi_driver:
880
885
  addons.append('GcePersistentDiskCsiDriver')
881
886
 
887
+ if args.enable_lustre_csi_driver:
888
+ addons.append('LustreCsiDriver')
889
+ command += ' --enable-legacy-lustre-port'
890
+
882
891
  if hasattr(args, 'enable_mtc') and args.enable_mtc:
883
892
  addons.append('HighScaleCheckpointing')
884
893
 
@@ -922,6 +931,13 @@ def install_storage_csis(args):
922
931
  if update_cluster_command_code != 0:
923
932
  xpk_exit(update_cluster_command_code)
924
933
 
934
+ if args.enable_lustre_csi_driver:
935
+ update_cluster_command_code = (
936
+ update_cluster_with_lustre_driver_if_necessary(args)
937
+ )
938
+ if update_cluster_command_code != 0:
939
+ xpk_exit(update_cluster_command_code)
940
+
925
941
 
926
942
  def install_kjob(args):
927
943
  xpk_print('Verifying kjob installation')
@@ -957,6 +973,11 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
957
973
  if enable_kueue_credentials_code != 0:
958
974
  xpk_exit(enable_kueue_credentials_code)
959
975
 
976
+ xpk_print('Update Kueue Controller Manager resources')
977
+ update_kueue_resources_code = update_kueue_resources_if_necessary(args)
978
+ if update_kueue_resources_code != 0:
979
+ xpk_exit(update_kueue_resources_code)
980
+
960
981
 
961
982
  def prepare_gpus(args, system: SystemCharacteristics):
962
983
  xpk_print('Installing NCCL Plugin for cluster')
xpk/commands/cluster_gcluster.py CHANGED
@@ -37,6 +37,7 @@ from ..utils.console import xpk_exit, xpk_print
37
37
  from ..utils.file import ensure_directory_exists
38
38
  from ..utils.network import all_IPs_cidr
39
39
  from ..utils.objects import hash_string
40
+ from ..core.capacity import get_reservation_maintenance_interval, get_reservation_placement_policy
40
41
 
41
42
  blueprints_path = os.path.abspath('xpkclusters/blueprints')
42
43
  gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -234,6 +235,30 @@ def generate_blueprint(
234
235
  if args.device_type in supported_device_types:
235
236
  if args.device_type == a3mega_device_type:
236
237
  num_nodes = args.num_nodes if not args.num_nodes is None else 2
238
+
239
+ maintenance_interval = (
240
+ get_reservation_maintenance_interval(
241
+ args.reservation, args.zone, args.project
242
+ )
243
+ if args.reservation is not None
244
+ else 'PERIODIC'
245
+ )
246
+ placement_policy_name = (
247
+ get_reservation_placement_policy(
248
+ args.reservation, args.zone, args.project
249
+ )
250
+ if args.reservation is not None
251
+ else None
252
+ )
253
+ placement_policy = (
254
+ {
255
+ 'type': 'COMPACT',
256
+ 'name': placement_policy_name.split('/')[-1],
257
+ }
258
+ if placement_policy_name is not None
259
+ and len(placement_policy_name) > 0
260
+ else None
261
+ )
237
262
  return bpg.generate_a3_mega_blueprint(
238
263
  blueprint_name=blueprint_name,
239
264
  prefix=prefix,
@@ -243,6 +268,8 @@ def generate_blueprint(
243
268
  zone=args.zone,
244
269
  auth_cidr=all_IPs_cidr,
245
270
  num_nodes=num_nodes,
271
+ reservation_maintenance_interval=maintenance_interval,
272
+ reservation_placement_policy=placement_policy,
246
273
  reservation=args.reservation if args.reservation else None,
247
274
  capacity_type=capacity_type,
248
275
  system_node_pool_machine_type=args.default_pool_cpu_machine_type,
xpk/commands/common.py CHANGED
@@ -15,10 +15,12 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from ..core.commands import run_command_with_updates_retry
18
- from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
19
18
  from ..core.capacity import H100_MEGA_DEVICE_TYPE, CapacityType
20
19
  from ..core.gcloud_context import zone_to_region
21
20
  from ..utils.console import xpk_print, xpk_exit
21
+ from ..core.system_characteristics import (
22
+ SystemCharacteristics,
23
+ )
22
24
 
23
25
 
24
26
  def set_cluster_command(args) -> int:
@@ -47,7 +49,11 @@ def set_cluster_command(args) -> int:
47
49
  return return_code
48
50
 
49
51
 
50
- def is_TAS_possible(args) -> bool:
52
+ def is_TAS_possible(
53
+ system_characteristics: SystemCharacteristics,
54
+ capacity_type: CapacityType,
55
+ flex: bool,
56
+ ) -> bool:
51
57
  """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
52
58
 
53
59
  Args:
@@ -56,8 +62,6 @@ def is_TAS_possible(args) -> bool:
56
62
  Returns:
57
63
  True if possible and False otherwise.
58
64
  """
59
- system_characteristics = get_cluster_system_characteristics(args)
60
- capacity_type = get_cluster_capacity_type(args)
61
65
 
62
66
  if system_characteristics is None:
63
67
  xpk_print('system_characteristics data was not found in configmaps.')
@@ -67,9 +71,12 @@ def is_TAS_possible(args) -> bool:
67
71
  xpk_print('capacity_type data was not found in configmaps.')
68
72
  xpk_exit(1)
69
73
 
74
+ if flex:
75
+ return False
76
+
70
77
  if (
71
78
  system_characteristics.device_type == H100_MEGA_DEVICE_TYPE
72
- and capacity_type == CapacityType.SPOT
79
+ and capacity_type != CapacityType.RESERVATION
73
80
  ):
74
81
  return False
75
82
 
xpk/commands/kjob_common.py CHANGED
@@ -27,6 +27,7 @@ from ..core.kjob import (
27
27
  Kueue_TAS_annotation,
28
28
  )
29
29
  from .common import is_TAS_possible
30
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
30
31
 
31
32
 
32
33
  def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
@@ -50,7 +51,9 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
50
51
 
51
52
 
52
53
  def add_TAS_annotations_to_command(args, cmd: str) -> str:
53
- if is_TAS_possible(args):
54
+ system_characteristics = get_cluster_system_characteristics(args)
55
+ capacity_type = get_cluster_capacity_type(args)
56
+ if is_TAS_possible(system_characteristics, capacity_type, flex=False):
54
57
  cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
55
58
 
56
59
  return cmd
xpk/commands/run.py CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  from argparse import Namespace
18
18
 
19
19
  from ..core.cluster import (
20
- create_xpk_k8s_service_account,
20
+ setup_k8s_service_accounts,
21
21
  get_cluster_credentials,
22
22
  )
23
23
  from ..core.commands import run_command_with_full_controls
@@ -53,7 +53,7 @@ def run(args: Namespace) -> None:
53
53
  err_code = prepare_kjob(args)
54
54
  if err_code > 0:
55
55
  xpk_exit(err_code)
56
- create_xpk_k8s_service_account()
56
+ setup_k8s_service_accounts()
57
57
 
58
58
  submit_job(args)
59
59
 
xpk/commands/shell.py CHANGED
@@ -12,7 +12,7 @@ limitations under the License.
12
12
  """
13
13
 
14
14
  from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
15
- from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
15
+ from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
16
16
  from ..utils.console import xpk_exit, xpk_print
17
17
  from argparse import Namespace
18
18
 
@@ -82,7 +82,7 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
82
82
  err_code = prepare_kjob(args)
83
83
  if err_code > 0:
84
84
  xpk_exit(err_code)
85
- create_xpk_k8s_service_account()
85
+ setup_k8s_service_accounts()
86
86
 
87
87
  cmd = (
88
88
  'kubectl-kjob create interactive --profile'
xpk/commands/storage.py CHANGED
@@ -29,6 +29,7 @@ from ..core.cluster import (
29
29
  setup_k8s_env,
30
30
  update_cluster_with_parallelstore_driver_if_necessary,
31
31
  update_cluster_with_pd_driver_if_necessary,
32
+ update_cluster_with_lustre_driver_if_necessary,
32
33
  update_cluster_with_gcpfilestore_driver_if_necessary,
33
34
  update_cluster_with_gcsfuse_driver_if_necessary,
34
35
  update_cluster_with_workload_identity_if_necessary,
@@ -45,6 +46,7 @@ from ..core.storage import (
45
46
  GCS_FUSE_TYPE,
46
47
  GCE_PD_TYPE,
47
48
  PARALLELSTORE_TYPE,
49
+ LUSTRE_TYPE,
48
50
  STORAGE_CRD_PLURAL,
49
51
  XPK_API_GROUP_NAME,
50
52
  XPK_API_GROUP_VERSION,
@@ -183,11 +185,11 @@ def storage_attach(args: Namespace) -> None:
183
185
  args.prefetch_metadata,
184
186
  )
185
187
 
186
- elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
188
+ elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE]:
187
189
  if args.manifest is None:
188
190
  xpk_print(
189
- "Parallelstore and PersistentDisk are currently supported only with"
190
- " --manifest"
191
+ "Parallelstore, PersistentDisk, and Lustre are currently supported"
192
+ " only with --manifest"
191
193
  )
192
194
  xpk_exit(1)
193
195
 
@@ -234,6 +236,11 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
234
236
  if return_code > 0:
235
237
  xpk_exit(return_code)
236
238
 
239
+ if args.type == LUSTRE_TYPE:
240
+ return_code = update_cluster_with_lustre_driver_if_necessary(args)
241
+ if return_code > 0:
242
+ xpk_exit(return_code)
243
+
237
244
 
238
245
  def storage_list(args: Namespace) -> None:
239
246
  k8s_api_client = setup_k8s_env(args)
xpk/commands/workload.py CHANGED
@@ -14,23 +14,25 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from ..core.blueprint.blueprint_generator import (
18
+ a3high_device_type,
19
+ a3mega_device_type,
20
+ a3ultra_device_type,
21
+ a4_device_type,
22
+ )
17
23
  from ..core.cluster import (
18
24
  XPK_SA,
19
- create_xpk_k8s_service_account,
25
+ setup_k8s_service_accounts,
20
26
  get_cluster_credentials,
21
27
  setup_k8s_env,
22
28
  )
23
29
  from ..core.commands import run_command_with_updates, run_commands
24
- from ..core.config import (
25
- VERTEX_TENSORBOARD_FEATURE_FLAG,
26
- XPK_CURRENT_VERSION,
27
- parse_env_config,
28
- )
30
+ from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
29
31
  from ..core.docker_container import (
30
32
  get_main_container_docker_image,
31
33
  get_user_workload_container,
32
34
  )
33
- from ..core.docker_resources import get_volumes
35
+ from ..core.docker_resources import get_volumes, parse_env_config
34
36
  from ..core.gcloud_context import add_zone_and_project
35
37
  from ..core.kueue import LOCAL_QUEUE_NAME
36
38
  from ..core.monitoring import get_gke_outlier_dashboard
@@ -50,6 +52,10 @@ from ..core.pathways import (
50
52
  get_user_workload_for_pathways,
51
53
  try_to_delete_pathwaysjob_first,
52
54
  )
55
+ from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
56
+ from ..core.capacity import (
57
+ CapacityType,
58
+ )
53
59
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
54
60
  from ..core.scheduling import (
55
61
  check_if_workload_can_schedule,
@@ -65,6 +71,7 @@ from ..core.storage import (
65
71
  GCP_FILESTORE_TYPE,
66
72
  GCS_FUSE_TYPE,
67
73
  PARALLELSTORE_TYPE,
74
+ LUSTRE_TYPE,
68
75
  Storage,
69
76
  add_bucket_iam_members,
70
77
  get_storage_annotations,
@@ -76,7 +83,6 @@ from ..core.system_characteristics import (
76
83
  )
77
84
  from ..core.vertex import create_vertex_experiment
78
85
  from ..core.workload import (
79
- add_gpu_rxdm_container,
80
86
  check_if_workload_exists,
81
87
  get_workload_list,
82
88
  wait_for_job_completion,
@@ -85,12 +91,13 @@ from ..core.workload import (
85
91
  from ..core.workload_decorators import (
86
92
  rdma_decorator,
87
93
  storage_decorator,
94
+ tcpx_decorator,
88
95
  tcpxo_decorator,
89
96
  )
90
97
  from ..utils.console import get_user_input, xpk_exit, xpk_print
91
98
  from ..utils.file import write_tmp_file
92
- from .common import is_TAS_possible
93
99
  from . import cluster_gcluster
100
+ from .common import is_TAS_possible
94
101
 
95
102
  WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
96
103
  kind: JobSet
@@ -123,6 +130,8 @@ spec:
123
130
  {storage_annotations}
124
131
  spec:
125
132
  schedulerName: {args.scheduler}
133
+ imagePullSecrets:
134
+ - name: {args.docker_image_pull_secret}
126
135
  restartPolicy: Never
127
136
  {affinity}
128
137
  nodeSelector:
@@ -136,6 +145,8 @@ spec:
136
145
  containers:
137
146
  {container}
138
147
  serviceAccountName: {service_account}
148
+ tolerations:
149
+ {tpu_toleration}
139
150
  volumes:
140
151
  {volumes}
141
152
  """
@@ -175,6 +186,8 @@ spec:
175
186
  {gpu_scheduler}
176
187
  priorityClassName: {args.priority}
177
188
  restartPolicy: Never
189
+ imagePullSecrets:
190
+ - name: {args.docker_image_pull_secret}
178
191
  hostNetwork: true
179
192
  dnsPolicy: ClusterFirstWithHostNet
180
193
  terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
@@ -213,11 +226,12 @@ spec:
213
226
  metadata:
214
227
  labels:
215
228
  xpk.google.com/workload: {args.workload}
216
- annotations:
217
- {kueue_TAS_annotation}
229
+ annotations: {annotations}
218
230
  spec:
219
231
  priorityClassName: {args.priority}
220
232
  restartPolicy: Never
233
+ imagePullSecrets:
234
+ - name: {args.docker_image_pull_secret}
221
235
  dnsPolicy: ClusterFirstWithHostNet
222
236
  terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
223
237
  serviceAccountName: {service_account}
@@ -291,7 +305,7 @@ def workload_create(args) -> None:
291
305
  0 if successful and 1 otherwise.
292
306
  """
293
307
  k8s_api_client = setup_k8s_env(args)
294
- create_xpk_k8s_service_account()
308
+ setup_k8s_service_accounts()
295
309
 
296
310
  workload_exists = check_if_workload_exists(args)
297
311
 
@@ -347,7 +361,7 @@ def workload_create(args) -> None:
347
361
  if not tensorboard_config:
348
362
  xpk_exit(1)
349
363
 
350
- parse_env_config(args, tensorboard_config, system)
364
+ parse_env_config(args, tensorboard_config)
351
365
 
352
366
  autoprovisioning_args = ''
353
367
  autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
@@ -382,6 +396,9 @@ def workload_create(args) -> None:
382
396
  pd_storages: list[Storage] = list(
383
397
  filter(lambda storage: storage.type == GCE_PD_TYPE, storages)
384
398
  )
399
+ lustre_storages: list[Storage] = list(
400
+ filter(lambda storage: storage.type == LUSTRE_TYPE, storages)
401
+ )
385
402
  if len(gcs_fuse_storages) > 0:
386
403
  service_account = XPK_SA
387
404
  xpk_print(f'Detected gcsfuse Storages to add: {gcs_fuse_storages}')
@@ -411,11 +428,18 @@ def workload_create(args) -> None:
411
428
  else:
412
429
  xpk_print('No gce persistent disk instances to add detected.')
413
430
 
431
+ if len(lustre_storages) > 0:
432
+ service_account = XPK_SA
433
+ xpk_print(f'Detected managed lustre instances to add: {lustre_storages}')
434
+ else:
435
+ xpk_print('No managed lustre instances to add detected.')
436
+
414
437
  all_storages = (
415
438
  gcs_fuse_storages
416
439
  + gcpfilestore_storages
417
440
  + parallelstore_storages
418
441
  + pd_storages
442
+ + lustre_storages
419
443
  )
420
444
 
421
445
  # Currently failure policy rules are supported for Pathways workloads. b/408465881
@@ -447,31 +471,41 @@ def workload_create(args) -> None:
447
471
  )
448
472
  if return_code != 0:
449
473
  xpk_exit(return_code)
450
-
451
- kueue_TAS_annotation = (
452
- 'kueue.x-k8s.io/podset-preferred-topology:'
453
- ' "cloud.google.com/gce-topology-host"'
474
+ system_characteristics = get_cluster_system_characteristics(args)
475
+ capacity_type = get_cluster_capacity_type(args)
476
+
477
+ annotations = (
478
+ ''
479
+ if not is_TAS_possible(
480
+ system_characteristics,
481
+ capacity_type,
482
+ flex=True if capacity_type == CapacityType.FLEX_START else False,
483
+ )
484
+ else (
485
+ 'kueue.x-k8s.io/podset-preferred-topology:'
486
+ ' "cloud.google.com/gce-topology-host"'
487
+ )
454
488
  )
455
- if not is_TAS_possible(args):
456
- kueue_TAS_annotation = ''
457
489
 
458
- if system.device_type in cluster_gcluster.supported_device_types:
490
+ if (
491
+ system.device_type in cluster_gcluster.supported_device_types
492
+ or system.device_type == a3high_device_type
493
+ ):
459
494
  yml_string = A3_GPU_WORKLOAD_CREATE_YAML.format(
460
495
  args=args,
461
496
  container=container,
462
497
  service_account=XPK_SA,
463
498
  failure_policy_rules=failure_policy_rules,
464
499
  pod_failure_policy=pod_failure_policy,
465
- kueue_TAS_annotation=kueue_TAS_annotation,
500
+ annotations=annotations,
466
501
  )
467
502
 
468
503
  sub_networks = get_cluster_subnetworks(args)
469
- if args.device_type == cluster_gcluster.a3mega_device_type:
504
+ if args.device_type == a3high_device_type:
505
+ yml_string = tcpx_decorator.decorate_jobset(yml_string)
506
+ elif args.device_type == a3mega_device_type:
470
507
  yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
471
- elif args.device_type in [
472
- cluster_gcluster.a3ultra_device_type,
473
- cluster_gcluster.a4_device_type,
474
- ]:
508
+ elif args.device_type in [a3ultra_device_type, a4_device_type]:
475
509
  yml_string = rdma_decorator.decorate_jobset(yml_string, sub_networks)
476
510
 
477
511
  if all_storages:
@@ -489,7 +523,6 @@ def workload_create(args) -> None:
489
523
  failure_policy_rules=failure_policy_rules,
490
524
  pod_failure_policy=pod_failure_policy,
491
525
  )
492
- yml_string = add_gpu_rxdm_container(yml_string, system, all_storages)
493
526
 
494
527
  elif args.use_pathways and ensure_pathways_workload_prerequisites(
495
528
  args, system
@@ -526,6 +559,10 @@ def workload_create(args) -> None:
526
559
  get_storage_annotations(all_storages)
527
560
  ),
528
561
  service_account=service_account,
562
+ tpu_toleration="""
563
+ - operator: "Exists"
564
+ key: google.com/tpu
565
+ """ if system.accelerator_type == AcceleratorType['TPU'] else '',
529
566
  failure_policy_rules=failure_policy_rules,
530
567
  pod_failure_policy=pod_failure_policy,
531
568
  )