xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -41,17 +41,12 @@ from ..core.gcloud_context import (
  add_zone_and_project,
  get_gke_control_plane_version,
  get_gke_server_config,
+ get_cluster_location,
  zone_to_region,
  )
  from ..core.jobset import update_jobset_resources_if_necessary
  from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
- from ..core.kueue import (
- cluster_preheat_yml,
- install_kueue_crs,
- install_kueue_on_cluster,
- wait_for_kueue_available,
- update_kueue_resources_if_necessary,
- )
+ from ..core.kueue_manager import (KueueConfig, KueueManager)
  from ..core.nap import enable_autoprovisioning_on_cluster
  from ..core.network import (
  create_cluster_network_config,
@@ -65,6 +60,7 @@ from ..core.nodepool import (
  from ..core.ray import install_ray_cluster
  from ..core.mtc import install_mtc_on_cluster
  from ..core.resources import create_cluster_configmaps
+ from ..core.scheduling import get_total_chips_requested_from_args
  from ..core.storage import install_storage_crd
  from ..core.system_characteristics import (
  AcceleratorType,
@@ -76,11 +72,17 @@ from ..core.vertex import create_vertex_tensorboard
  from ..core.workload import get_workload_list
  from ..utils.console import get_user_input, xpk_exit, xpk_print
  from ..utils.file import write_tmp_file
+ from ..utils.execution_context import is_dry_run
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
  from . import cluster_gcluster
  from .common import set_cluster_command
+ from jinja2 import Environment, FileSystemLoader
+ from ..utils.templates import TEMPLATE_PATH
  import shutil
  import os

+ CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+

  def cluster_adapt(args) -> None:
  """Function that performs cluster adaptation.
@@ -88,6 +90,12 @@ def cluster_adapt(args) -> None:
  Args:
  args: user provided arguments for running the command.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([
+ SystemDependency.KUBECTL,
+ SystemDependency.KJOB,
+ SystemDependency.GCLOUD,
+ ])
  args.enable_pathways = False

  system, return_code = get_system_characteristics(args)
@@ -108,7 +116,7 @@ def cluster_adapt(args) -> None:
  'Argument --num-nodes was not provided, trying to determine number of'
  ' nodes based on the available nodes in the cluster...'
  )
- args.num_nodes = count_nodes_on_cluster(args, system)
+ args.num_nodes = count_nodes_on_cluster(system)
  if args.num_nodes == 0:
  xpk_print(
  'Found unexpected number of nodes. Is the --device-type correct?'
@@ -128,9 +136,10 @@ def cluster_adapt(args) -> None:

  get_cluster_credentials(args)

- k8s_client = setup_k8s_env(args)
+ if not is_dry_run():
+ k8s_client = setup_k8s_env(args)
+ install_storage_crd(k8s_client)

- install_storage_crd(k8s_client)
  install_storage_csis(args)

  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -174,7 +183,7 @@ def cluster_adapt(args) -> None:

  install_kjob(args)
  if system.accelerator_type == AcceleratorType['GPU']:
- prepare_gpus(args, system)
+ prepare_gpus(system)

  if args.enable_ray_cluster:
  return_code = install_ray_cluster(args, system)
@@ -186,7 +195,7 @@ def cluster_adapt(args) -> None:
  xpk_print(
  'See your GKE Cluster here:'
  # pylint: disable=line-too-long
- f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
  )
  xpk_exit(0)

@@ -197,6 +206,12 @@ def cluster_create(args) -> None:
  Args:
  args: user provided arguments for running the command.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([
+ SystemDependency.KUBECTL,
+ SystemDependency.KJOB,
+ SystemDependency.GCLOUD,
+ ])
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
@@ -247,13 +262,14 @@ def cluster_create(args) -> None:

  get_cluster_credentials(args)

- update_coredns_command_code = update_coredns_if_necessary(args)
+ update_coredns_command_code = update_coredns_if_necessary()
  if update_coredns_command_code != 0:
  xpk_exit(update_cluster_command_code)

- k8s_client = setup_k8s_env(args)
+ if not is_dry_run():
+ k8s_client = setup_k8s_env(args)
+ install_storage_crd(k8s_client)

- install_storage_crd(k8s_client)
  install_storage_csis(args)

  # create Vertex Tensorboard for new and existing clusters if create-vertex-tensorboard is set
@@ -314,7 +330,7 @@ def cluster_create(args) -> None:
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
  if set_jobset_on_cluster_code != 0:
  xpk_exit(set_jobset_on_cluster_code)
- update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+ update_jobset_resources_code = update_jobset_resources_if_necessary()
  if update_jobset_resources_code != 0:
  xpk_exit(update_jobset_resources_code)

@@ -327,7 +343,7 @@ def cluster_create(args) -> None:
  install_kjob(args)

  if system.accelerator_type == AcceleratorType['GPU']:
- prepare_gpus(args, system)
+ prepare_gpus(system)

  if args.enable_ray_cluster:
  return_code = install_ray_cluster(args, system)
@@ -345,7 +361,7 @@ def cluster_create(args) -> None:
  xpk_print(
  'See your GKE Cluster here:'
  # pylint: disable=line-too-long
- f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
  )
  xpk_exit(0)

@@ -359,6 +375,8 @@ def cluster_delete(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([SystemDependency.GCLOUD])
  xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

@@ -388,6 +406,10 @@ def cluster_cacheimage(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list(
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+ )
  xpk_print(
  f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
  )
@@ -403,27 +425,26 @@ def cluster_cacheimage(args) -> None:
  node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
  system.accelerator_type
  ].accelerator_label
- yml_string = cluster_preheat_yml.format(
+
+ template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
+ cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+ rendered_yaml = cluster_preheat_yaml.render(
  cachekey=args.cache_key,
  image_name=args.docker_image,
  nodeSelectorKey=node_selector_key,
  )
- tmp = write_tmp_file(yml_string)
- command_apply = f'kubectl apply -f {str(tmp.file.name)}'
- command_delete = (
- f'kubectl delete -f {str(tmp.file.name)} --ignore-not-found=true'
- )
+ tmp = write_tmp_file(rendered_yaml)
+ command_apply = f'kubectl apply -f {str(tmp)}'
+ command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'

  return_code = run_command_with_updates(
- command_delete, 'Deleting Cached Image', args
+ command_delete, 'Deleting Cached Image'
  )
  if return_code != 0:
  xpk_print(f'Delete Cached Image returned ERROR {return_code}')
  xpk_exit(return_code)

- return_code = run_command_with_updates(
- command_apply, 'Creating Cached Image', args
- )
+ return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
  if return_code != 0:
  xpk_print(f'Create Cached Image returned ERROR {return_code}')
  xpk_exit(return_code)
@@ -439,12 +460,16 @@ def cluster_describe(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list(
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+ )
  xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

  get_cluster_credentials(args)

- return_code, data_table = nodepools_build_table(args)
+ return_code, data_table = nodepools_build_table()
  if return_code != 0:
  xpk_exit(return_code)

@@ -460,7 +485,6 @@ def cluster_describe(args) -> None:
  r'kubectl get node --no-headers=true'
  r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
  'Count TPU Nodes',
- args,
  )
  if return_code_node_output != 0:
  xpk_exit(return_code_node_output)
@@ -471,7 +495,6 @@ def cluster_describe(args) -> None:
  "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
  ' Running | wc -l',
  'Count TPU Pods',
- args,
  )
  if return_code_pod_output != 0:
  xpk_exit(return_code_pod_output)
@@ -486,7 +509,7 @@ def cluster_describe(args) -> None:
  xpk_exit(0)


- def nodepools_build_table(args) -> tuple[int, list[list]]:
+ def nodepools_build_table() -> tuple[int, list[list]]:
  table = [[
  'NODEPOOL_NAME',
  'SLICE',
@@ -498,14 +521,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:

  nodepools_data = {}

- nodepools, return_code = get_node_pools_name(args)
+ nodepools, return_code = get_node_pools_name()
  if return_code != 0:
  xpk_print(f'Get node pools name returned ERROR {return_code}')

  for name in nodepools:
  nodepools_data[name] = [name]

- slices, return_code = get_slice_node_pool_size(args)
+ slices, return_code = get_slice_node_pool_size()
  if return_code != 0:
  xpk_print(f'Get slice node pool size returned ERROR {return_code}')

@@ -514,7 +537,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = s[0], s[1]
  nodepools_data[nodepool_name].append(count)

- type_nodepool, return_code = get_node_pool_instance_type(args)
+ type_nodepool, return_code = get_node_pool_instance_type()
  if return_code != 0:
  xpk_print(f'Get node pool instance type returned ERROR {return_code}')

@@ -523,7 +546,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  nodepool_name, instance_type = tn[0], tn[1]
  nodepools_data[nodepool_name].append(instance_type)

- expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+ expected_healthy_nodes, return_code = get_expected_healthy_nodes()
  if return_code != 0:
  xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')

@@ -532,7 +555,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = ehn[0], ehn[1]
  nodepools_data[nodepool_name].append(count)

- actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+ actual_healthy_nodes, return_code = get_actual_healthy_nodes()
  if return_code != 0:
  xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')

@@ -541,7 +564,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = ahn[0], ahn[1]
  nodepools_data[nodepool_name].append(count)

- total_nodes, return_code = get_total_nodes_per_node_pool(args)
+ total_nodes, return_code = get_total_nodes_per_node_pool()
  if return_code != 0:
  xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')

@@ -556,20 +579,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  return 0, table


- def get_node_pools_name(args) -> tuple[list[str], int]:
+ def get_node_pools_name() -> tuple[list[str], int]:
  cmd_nodepools = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
  " | grep -v 'none' | sort | uniq"
  )
- return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+ return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
  if return_code != 0:
  return [], return_code

  return out.splitlines(), 0


- def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+ def get_slice_node_pool_size() -> tuple[list[str], int]:
  cmd_slices = (
  'kubectl get node --no-headers=true -o'
  " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -578,7 +601,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_slices, 'Count nodes per nodepool slice', args
+ cmd_slices, 'Count nodes per nodepool slice'
  )
  if return_code != 0:
  return [], return_code
@@ -586,7 +609,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+ def get_node_pool_instance_type() -> tuple[list[str], int]:
  cmd_type_nodepool = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
@@ -594,7 +617,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  " 'none' | sort | uniq"
  )
  return_code, out = run_command_for_value(
- cmd_type_nodepool, 'Instance type of nodepools', args
+ cmd_type_nodepool, 'Instance type of nodepools'
  )
  if return_code != 0:
  return [], return_code
@@ -602,7 +625,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+ def get_expected_healthy_nodes() -> tuple[list[str], int]:
  cmd_expected_healthy_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -613,7 +636,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  return_code, out = run_command_for_value(
  cmd_expected_healthy_nodes,
  'Count expected healthy nodes per nodepool',
- args,
  )
  if return_code != 0:
  return [], return_code
@@ -621,7 +643,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+ def get_actual_healthy_nodes() -> tuple[list[str], int]:
  cmd_actual_healthy_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODE_NAME:metadata.name,"
@@ -634,7 +656,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+ cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
  )
  if return_code != 0:
  return [], return_code
@@ -642,7 +664,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+ def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
  cmd_total_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODE_NAME:metadata.name,"
@@ -654,7 +676,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_total_nodes, 'Count total nodes per nodepool', args
+ cmd_total_nodes, 'Count total nodes per nodepool'
  )
  if return_code != 0:
  return [], return_code
@@ -671,6 +693,8 @@ def cluster_list(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([SystemDependency.GCLOUD])
  add_zone_and_project(args)
  xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
  if run_gke_clusters_list_command(args):
@@ -706,20 +730,20 @@ def cluster_create_ray_cluster(args) -> None:
  cluster_create(args)


- def install_jq(args):
+ def install_jq():
  """Installs 'jq' utility."""
  if shutil.which('jq'):
  xpk_print("Task: 'Install jq' skipped, jq already installed.")
  return
  command_jq_install = 'sudo apt install jq -y'
  xpk_print("Task: 'Install jq' in progress.")
- return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+ return_code = run_command_with_updates(command_jq_install, 'Install jq')
  if return_code != 0:
  xpk_print(f'Install jq error {return_code}')
  xpk_exit(return_code)


- def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+ def clone_coredns_deployment_repo(coredns_repo_full_path: str):
  """Clones the CoreDNS deployment repository if it doesn't exist."""
  if os.path.exists(coredns_repo_full_path):
  xpk_print(
@@ -734,15 +758,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
  "Task: 'Clone deployment' in progress, Target"
  f' directory:{coredns_repo_full_path}.'
  )
- return_code = run_command_with_updates(
- command_git_clone, 'Clone deployment', args
- )
+ return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
  if return_code != 0:
  xpk_print(f'Clone deployment error {return_code}')
  xpk_exit(return_code)


- def deploy_coredns_manifests(args, coredns_k8s_path: str):
+ def deploy_coredns_manifests(coredns_k8s_path: str):
  """Deploys CoreDNS manifests to the cluster."""
  if not os.path.isdir(coredns_k8s_path):
  xpk_print(
@@ -760,7 +782,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
  f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
  )
  return_code = run_command_with_updates(
- command_deploy_coredns, 'Deploy CoreDNS', args
+ command_deploy_coredns, 'Deploy CoreDNS'
  )
  if return_code != 0:
  xpk_print(f'Deploy CoreDNS error {return_code}')
@@ -772,9 +794,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
  xpk_exit(return_code)


- def scale_down_deployment(
- args, deployment_name: str, namespace: str = 'kube-system'
- ):
+ def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
  """Scales down a specified Kubernetes deployment to 0 replicas."""
  command = (
  f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -782,29 +802,27 @@ def scale_down_deployment(
  )
  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
  return_code = run_command_with_updates(
- command, f'Scale down {deployment_name}', args
+ command, f'Scale down {deployment_name}'
  )
  if return_code != 0:
  xpk_print(f'Scale down {deployment_name} error {return_code}')
  xpk_exit(return_code)
- xpk_print(f'\n{deployment_name} has been scaled down.')
+ xpk_print(f'{deployment_name} has been scaled down.')


- def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+ def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
  """Scales up the CoreDNS deployment to a specified number of replicas."""
  command_coredns_scale = (
  f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
  )
  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
- return_code = run_command_with_updates(
- command_coredns_scale, 'Scale CoreDNS', args
- )
+ return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
  if return_code != 0:
  xpk_print(f'Scale CoreDNS error {return_code}')
  xpk_exit(return_code)


- def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+ def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
  """Check for the existence of a specific Deployment in a given namespace."""
  # TODO: rewrite this to be more obvious, check if it is correct
  command = (
@@ -812,17 +830,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
  f' {namespace} --ignore-not-found'
  )
  result = run_command_with_updates(
- command, 'Waiting for kubeDNS to be checked.', args
+ command, 'Waiting for kubeDNS to be checked.'
  )
  return result != 0


  def verify_coredns_readiness(
- args, timeout: int = 240, namespace: str = 'kube-system'
+ timeout: int = 240, namespace: str = 'kube-system'
  ):
  """Verifies CoreDNS readiness using kubectl wait commands."""
  xpk_print('Now verifying CoreDNS readiness...')
- kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+ kube_dns_exists = check_deployment_exists('kube-dns', namespace)
  if kube_dns_exists:
  # Wait for kube-dns to be fully scaled down
  command_kube_dns_wait_scaled_down = (
@@ -832,7 +850,7 @@ def verify_coredns_readiness(
  )
  xpk_print('Verifying if kube-dns has scaled down...')
  return_code_kube_dns = run_command_with_updates(
- command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+ command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
  )
  if return_code_kube_dns != 0:
  xpk_print('kube-dns did not scale down successfully within the timeout.')
@@ -848,7 +866,7 @@ def verify_coredns_readiness(
  )
  xpk_print('Verifying if CoreDNS is available...')
  return_code_coredns = run_command_with_updates(
- command_coredns_wait_available, 'Wait for coredns available', args
+ command_coredns_wait_available, 'Wait for coredns available'
  )
  if return_code_coredns != 0:
  xpk_print(
@@ -873,12 +891,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
  xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')


- def update_coredns(args) -> int:
+ def update_coredns() -> int:
  """Updates and deploys CoreDNS within a cluster.

- Args:
- args: user provided arguments for running the command.
-
  Returns:
  0 if successful and 1 otherwise.
  """
@@ -887,23 +902,23 @@ def update_coredns(args) -> int:
  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
  # 1. Install jq
- install_jq(args)
+ install_jq()

  # 2. Clone CoreDNS deployment repository
- clone_coredns_deployment_repo(args, coredns_repo_full_path)
+ clone_coredns_deployment_repo(coredns_repo_full_path)

  # 3. Deploy CoreDNS to the cluster
- deploy_coredns_manifests(args, coredns_k8s_path)
+ deploy_coredns_manifests(coredns_k8s_path)

  # 4. Scale down kube-dns-autoscaler
- scale_down_deployment(args, 'kube-dns-autoscaler')
+ scale_down_deployment('kube-dns-autoscaler')

  # 5. Scale down kube-dns
- scale_down_deployment(args, 'kube-dns')
+ scale_down_deployment('kube-dns')

  # 6. Scale up coredns and verify readiness
- scale_up_coredns(args, replicas=15)
- verify_coredns_readiness(args, timeout=120)
+ scale_up_coredns(replicas=15)
+ verify_coredns_readiness(timeout=120)

  xpk_print('The CoreDNS setup process has been completed.')

@@ -913,7 +928,7 @@ def update_coredns(args) -> int:
  return 0


- def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
  """Checks if the CoreDNS deployment exists in the given namespace.

  Args:
@@ -928,10 +943,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
  f' namespace: {namespace}'
  )
  return_code = run_command_with_updates(
- command, f'Check CoreDNS deployment in {namespace}', args
+ command, f'Check CoreDNS deployment in {namespace}'
  )
  if return_code == 0:
- verify_coredns_readiness(args)
+ verify_coredns_readiness()
  xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
  return True
  else:
@@ -942,25 +957,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
  return False


- def update_coredns_if_necessary(args) -> int:
+ def update_coredns_if_necessary() -> int:
  """Updates and deploys CoreDNS within the cluster if it's not already present.

  This function checks for the existence of the CoreDNS deployment.
  If it's not found, it proceeds to deploy and configure CoreDNS.

- Args:
- args: User-provided arguments for running the command.
-
  Returns:
  0 if successful (CoreDNS was already present or successfully deployed),
  and 1 otherwise.
  """
- if coredns_deployment_exists(args, namespace='kube-system'):
+ if coredns_deployment_exists(namespace='kube-system'):
  xpk_print('Skipping CoreDNS deployment since it already exists.')
  return 0
  else:
  xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
- return update_coredns(args)
+ return update_coredns()


  def create_cluster_if_necessary(
@@ -1020,10 +1032,10 @@ def run_gke_cluster_delete_command(args) -> int:
  command = (
  'gcloud beta container clusters delete'
  f' {args.cluster} --project={args.project}'
- f' --region={zone_to_region(args.zone)} --quiet'
+ f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
  )

- return_code = run_command_with_updates(command, 'Cluster Delete', args)
+ return_code = run_command_with_updates(command, 'Cluster Delete')
  if return_code != 0:
  xpk_print(f'Cluster delete request returned ERROR {return_code}')
  return 1
@@ -1046,9 +1058,9 @@ def run_gke_clusters_list_command(args) -> int:
  """
  command = (
  'gcloud container clusters list'
- f' --project={args.project} --region={zone_to_region(args.zone)}'
+ f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
  )
- return_code = run_command_with_updates(command, 'Cluster List', args)
+ return_code = run_command_with_updates(command, 'Cluster List')
  if return_code != 0:
  xpk_print(f'Cluster list request returned ERROR {return_code}')
  return 1
@@ -1104,6 +1116,7 @@ def run_gke_cluster_create_command(
  f' {rapid_release_cmd}'
  ' --enable-dns-access'
  ' --autoscaling-profile=optimize-utilization'
+ ' --labels=gke_product_type=xpk'
  )

  enable_ip_alias = False
@@ -1157,7 +1170,7 @@ def run_gke_cluster_create_command(
  addons_str = ','.join(addons)
  command += f' --addons={addons_str}'

- return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+ return_code = run_command_with_updates(command, 'GKE Cluster Create')
  if return_code != 0:
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
  return 1
@@ -1203,12 +1216,12 @@ def install_storage_csis(args):

  def install_kjob(args):
  xpk_print('Verifying kjob installation')
- err_code = verify_kjob_installed(args)
+ err_code = verify_kjob_installed()
  if err_code > 0:
  xpk_exit(err_code)

  xpk_print('Applying kjob CDRs')
- err_code = apply_kjob_crds(args)
+ err_code = apply_kjob_crds()
  if err_code > 0:
  xpk_exit(err_code)

@@ -1219,42 +1232,43 @@ def install_kjob(args):

  def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
  xpk_print('Enabling Kueue on the cluster')
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
- if install_kueue_on_cluster_code != 0:
- xpk_exit(install_kueue_on_cluster_code)
-
- xpk_print('Wait for Kueue to be fully available')
- wait_for_kueue_available_code = wait_for_kueue_available(args)
- if wait_for_kueue_available_code != 0:
- xpk_exit(wait_for_kueue_available_code)
-
- xpk_print('Install Kueue Custom Resources')
- enable_kueue_credentials_code = install_kueue_crs(
- args, system, autoprovisioning_config
+ autoprovisioning_enabled = False
+ if autoprovisioning_config:
+ # Determine total resources available based on autoprovisioning max chips.
+ autoprovisioning_enabled = True
+ total_chips = autoprovisioning_config.maximum_chips
+ else:
+ # Determine total chips based on user specified topology.
+ total_chips = get_total_chips_requested_from_args(args, system)
+ kueue_manager = KueueManager()
+ kueue_manager.install_or_upgrade(
+ KueueConfig(
+ system,
+ total_chips=total_chips,
+ autoprovisioning_enabled=autoprovisioning_enabled,
+ num_slices=args.num_slices,
+ flex=args.flex,
+ memory_limit=args.memory_limit,
+ cpu_limit=args.cpu_limit,
+ is_pathways_cluster=args.enable_pathways,
+ ),
  )
- if enable_kueue_credentials_code != 0:
- xpk_exit(enable_kueue_credentials_code)
-
- xpk_print('Update Kueue Controller Manager resources')
- update_kueue_resources_code = update_kueue_resources_if_necessary(args)
- if update_kueue_resources_code != 0:
- xpk_exit(update_kueue_resources_code)


- def prepare_gpus(args, system: SystemCharacteristics):
+ def prepare_gpus(system: SystemCharacteristics):
  xpk_print('Installing NCCL Plugin for cluster')
- install_nccl_code = install_nccl_on_cluster(args, system)
+ install_nccl_code = install_nccl_on_cluster(system)
  if install_nccl_code != 0:
  xpk_exit(install_nccl_code)

  if system.device_type == H100_DEVICE_TYPE:
  xpk_print('Installing NRI device injector for cluster')
- install_nri_code = install_nri_on_cluster(args)
+ install_nri_code = install_nri_on_cluster()
  if install_nri_code != 0:
  xpk_exit(install_nri_code)

  if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
  xpk_print('Disabling MGLRU')
- err_code = disable_mglru_on_cluster(args)
+ err_code = disable_mglru_on_cluster()
  if err_code > 0:
  xpk_exit(err_code)