xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -41,17 +41,12 @@ from ..core.gcloud_context import (
  add_zone_and_project,
  get_gke_control_plane_version,
  get_gke_server_config,
+ get_cluster_location,
  zone_to_region,
  )
  from ..core.jobset import update_jobset_resources_if_necessary
  from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
- from ..core.kueue import (
- cluster_preheat_yml,
- install_kueue_crs,
- install_kueue_on_cluster,
- wait_for_kueue_available,
- update_kueue_resources_if_necessary,
- )
+ from ..core.kueue_manager import (KueueConfig, KueueManager)
  from ..core.nap import enable_autoprovisioning_on_cluster
  from ..core.network import (
  create_cluster_network_config,
@@ -65,6 +60,7 @@ from ..core.nodepool import (
  from ..core.ray import install_ray_cluster
  from ..core.mtc import install_mtc_on_cluster
  from ..core.resources import create_cluster_configmaps
+ from ..core.scheduling import get_total_chips_requested_from_args
  from ..core.storage import install_storage_crd
  from ..core.system_characteristics import (
  AcceleratorType,
@@ -77,11 +73,16 @@ from ..core.workload import get_workload_list
  from ..utils.console import get_user_input, xpk_exit, xpk_print
  from ..utils.file import write_tmp_file
  from ..utils.execution_context import is_dry_run
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
  from . import cluster_gcluster
  from .common import set_cluster_command
+ from jinja2 import Environment, FileSystemLoader
+ from ..utils.templates import TEMPLATE_PATH
  import shutil
  import os

+ CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+

  def cluster_adapt(args) -> None:
  """Function that performs cluster adaptation.
@@ -89,6 +90,12 @@ def cluster_adapt(args) -> None:
  Args:
  args: user provided arguments for running the command.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([
+ SystemDependency.KUBECTL,
+ SystemDependency.KJOB,
+ SystemDependency.GCLOUD,
+ ])
  args.enable_pathways = False

  system, return_code = get_system_characteristics(args)
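
Note: the dependency pre-flight above is new in 0.14.0 and recurs below in cluster_create, cluster_delete, cluster_cacheimage, cluster_describe, and cluster_list. The validators come from xpk/utils/validation.py (+79 −55 in this release), whose body is not part of this diff; the following is only a minimal sketch of the presumed contract, and the binary names and failure behaviour are assumptions:

    # Minimal sketch of the presumed xpk/utils/validation.py contract.
    # Binary names and the exit behaviour are assumptions, not the real code.
    import shutil
    from enum import Enum

    class SystemDependency(Enum):
      KUBECTL = 'kubectl'
      KJOB = 'kubectl-kjob'  # assumed: kjob ships as a kubectl plugin
      GCLOUD = 'gcloud'

    def validate_dependencies_list(deps: list[SystemDependency]) -> None:
      missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
      if missing:
        raise SystemExit(f"Missing required tools: {', '.join(missing)}")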
@@ -109,7 +116,7 @@ def cluster_adapt(args) -> None:
  'Argument --num-nodes was not provided, trying to determine number of'
  ' nodes based on the available nodes in the cluster...'
  )
- args.num_nodes = count_nodes_on_cluster(args, system)
+ args.num_nodes = count_nodes_on_cluster(system)
  if args.num_nodes == 0:
  xpk_print(
  'Found unexpected number of nodes. Is the --device-type correct?'
@@ -176,7 +183,7 @@ def cluster_adapt(args) -> None:

  install_kjob(args)
  if system.accelerator_type == AcceleratorType['GPU']:
- prepare_gpus(args, system)
+ prepare_gpus(system)

  if args.enable_ray_cluster:
  return_code = install_ray_cluster(args, system)
@@ -188,7 +195,7 @@ def cluster_adapt(args) -> None:
  xpk_print(
  'See your GKE Cluster here:'
  # pylint: disable=line-too-long
- f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
  )
  xpk_exit(0)
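
The console link above now resolves the cluster's real location with get_cluster_location instead of deriving a region from --zone, so the URL is also correct for zonal clusters; the same helper later replaces --region in the delete command. get_cluster_location is added to xpk/core/gcloud_context.py (covered by the new gcloud_context_test.py) but its body is not shown here, so this is only a rough sketch of the presumed behaviour:

    # Rough sketch; the real helper lives in xpk/core/gcloud_context.py.
    import subprocess

    def zone_to_region(zone: str) -> str:
      return zone.rsplit('-', 1)[0]  # 'us-central1-a' -> 'us-central1'

    def get_cluster_location(project: str, cluster: str, zone: str) -> str:
      # Ask gcloud where the named cluster actually lives (zone or region).
      result = subprocess.run(
          ['gcloud', 'container', 'clusters', 'list',
           f'--project={project}', f'--filter=name={cluster}',
           '--format=value(location)'],
          capture_output=True, text=True, check=False,
      )
      location = result.stdout.strip()
      # Fall back to the region derived from --zone if nothing is found.
      return location or zone_to_region(zone)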
@@ -199,6 +206,12 @@ def cluster_create(args) -> None:
  Args:
  args: user provided arguments for running the command.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([
+ SystemDependency.KUBECTL,
+ SystemDependency.KJOB,
+ SystemDependency.GCLOUD,
+ ])
  system, return_code = get_system_characteristics(args)

  if return_code > 0 or system is None:
@@ -249,7 +262,7 @@ def cluster_create(args) -> None:

  get_cluster_credentials(args)

- update_coredns_command_code = update_coredns_if_necessary(args)
+ update_coredns_command_code = update_coredns_if_necessary()
  if update_coredns_command_code != 0:
  xpk_exit(update_cluster_command_code)

@@ -317,7 +330,7 @@ def cluster_create(args) -> None:
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
  if set_jobset_on_cluster_code != 0:
  xpk_exit(set_jobset_on_cluster_code)
- update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+ update_jobset_resources_code = update_jobset_resources_if_necessary()
  if update_jobset_resources_code != 0:
  xpk_exit(update_jobset_resources_code)

@@ -330,7 +343,7 @@ def cluster_create(args) -> None:
  install_kjob(args)

  if system.accelerator_type == AcceleratorType['GPU']:
- prepare_gpus(args, system)
+ prepare_gpus(system)

  if args.enable_ray_cluster:
  return_code = install_ray_cluster(args, system)
@@ -348,7 +361,7 @@ def cluster_create(args) -> None:
  xpk_print(
  'See your GKE Cluster here:'
  # pylint: disable=line-too-long
- f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+ f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
  )
  xpk_exit(0)

@@ -362,6 +375,8 @@ def cluster_delete(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([SystemDependency.GCLOUD])
  xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

@@ -391,6 +406,10 @@ def cluster_cacheimage(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list(
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+ )
  xpk_print(
  f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
  )
@@ -406,25 +425,26 @@ def cluster_cacheimage(args) -> None:
  node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
  system.accelerator_type
  ].accelerator_label
- yml_string = cluster_preheat_yml.format(
+
+ template_env = Environment(loader=FileSystemLoader(TEMPLATE_PATH))
+ cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+ rendered_yaml = cluster_preheat_yaml.render(
  cachekey=args.cache_key,
  image_name=args.docker_image,
  nodeSelectorKey=node_selector_key,
  )
- tmp = write_tmp_file(yml_string)
+ tmp = write_tmp_file(rendered_yaml)
  command_apply = f'kubectl apply -f {str(tmp)}'
  command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'

  return_code = run_command_with_updates(
- command_delete, 'Deleting Cached Image', args
+ command_delete, 'Deleting Cached Image'
  )
  if return_code != 0:
  xpk_print(f'Delete Cached Image returned ERROR {return_code}')
  xpk_exit(return_code)

- return_code = run_command_with_updates(
- command_apply, 'Creating Cached Image', args
- )
+ return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
  if return_code != 0:
  xpk_print(f'Create Cached Image returned ERROR {return_code}')
  xpk_exit(return_code)
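
With xpk/core/kueue.py deleted, the preheat manifest above moves from a str.format template string (cluster_preheat_yml) to a packaged Jinja2 template, cluster_preheat.yaml.j2, loaded via TEMPLATE_PATH from xpk/utils/templates.py. The rendering pattern in isolation (the directory path and variable values here are placeholders, not xpk's real ones):

    # Standalone illustration of the Jinja2 rendering pattern used above;
    # 'templates' and the sample values are placeholders.
    from jinja2 import Environment, FileSystemLoader

    template_env = Environment(loader=FileSystemLoader('templates'))
    template = template_env.get_template('cluster_preheat.yaml.j2')
    rendered_yaml = template.render(
        cachekey='my-cache-key',
        image_name='gcr.io/my-project/my-image:latest',
        nodeSelectorKey='cloud.google.com/gke-tpu-accelerator',
    )
    print(rendered_yaml)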
@@ -440,12 +460,16 @@ def cluster_describe(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list(
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+ )
  xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
  add_zone_and_project(args)

  get_cluster_credentials(args)

- return_code, data_table = nodepools_build_table(args)
+ return_code, data_table = nodepools_build_table()
  if return_code != 0:
  xpk_exit(return_code)

@@ -461,7 +485,6 @@ def cluster_describe(args) -> None:
  r'kubectl get node --no-headers=true'
  r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
  'Count TPU Nodes',
- args,
  )
  if return_code_node_output != 0:
  xpk_exit(return_code_node_output)
@@ -472,7 +495,6 @@ def cluster_describe(args) -> None:
  "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
  ' Running | wc -l',
  'Count TPU Pods',
- args,
  )
  if return_code_pod_output != 0:
  xpk_exit(return_code_pod_output)
@@ -487,7 +509,7 @@ def cluster_describe(args) -> None:
  xpk_exit(0)


- def nodepools_build_table(args) -> tuple[int, list[list]]:
+ def nodepools_build_table() -> tuple[int, list[list]]:
  table = [[
  'NODEPOOL_NAME',
  'SLICE',
@@ -499,14 +521,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:

  nodepools_data = {}

- nodepools, return_code = get_node_pools_name(args)
+ nodepools, return_code = get_node_pools_name()
  if return_code != 0:
  xpk_print(f'Get node pools name returned ERROR {return_code}')

  for name in nodepools:
  nodepools_data[name] = [name]

- slices, return_code = get_slice_node_pool_size(args)
+ slices, return_code = get_slice_node_pool_size()
  if return_code != 0:
  xpk_print(f'Get slice node pool size returned ERROR {return_code}')

@@ -515,7 +537,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = s[0], s[1]
  nodepools_data[nodepool_name].append(count)

- type_nodepool, return_code = get_node_pool_instance_type(args)
+ type_nodepool, return_code = get_node_pool_instance_type()
  if return_code != 0:
  xpk_print(f'Get node pool instance type returned ERROR {return_code}')

@@ -524,7 +546,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  nodepool_name, instance_type = tn[0], tn[1]
  nodepools_data[nodepool_name].append(instance_type)

- expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+ expected_healthy_nodes, return_code = get_expected_healthy_nodes()
  if return_code != 0:
  xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')

@@ -533,7 +555,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = ehn[0], ehn[1]
  nodepools_data[nodepool_name].append(count)

- actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+ actual_healthy_nodes, return_code = get_actual_healthy_nodes()
  if return_code != 0:
  xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')

@@ -542,7 +564,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  count, nodepool_name = ahn[0], ahn[1]
  nodepools_data[nodepool_name].append(count)

- total_nodes, return_code = get_total_nodes_per_node_pool(args)
+ total_nodes, return_code = get_total_nodes_per_node_pool()
  if return_code != 0:
  xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')

@@ -557,20 +579,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
  return 0, table


- def get_node_pools_name(args) -> tuple[list[str], int]:
+ def get_node_pools_name() -> tuple[list[str], int]:
  cmd_nodepools = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
  " | grep -v 'none' | sort | uniq"
  )
- return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+ return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
  if return_code != 0:
  return [], return_code

  return out.splitlines(), 0


- def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+ def get_slice_node_pool_size() -> tuple[list[str], int]:
  cmd_slices = (
  'kubectl get node --no-headers=true -o'
  " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -579,7 +601,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_slices, 'Count nodes per nodepool slice', args
+ cmd_slices, 'Count nodes per nodepool slice'
  )
  if return_code != 0:
  return [], return_code
@@ -587,7 +609,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+ def get_node_pool_instance_type() -> tuple[list[str], int]:
  cmd_type_nodepool = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
@@ -595,7 +617,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  " 'none' | sort | uniq"
  )
  return_code, out = run_command_for_value(
- cmd_type_nodepool, 'Instance type of nodepools', args
+ cmd_type_nodepool, 'Instance type of nodepools'
  )
  if return_code != 0:
  return [], return_code
@@ -603,7 +625,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+ def get_expected_healthy_nodes() -> tuple[list[str], int]:
  cmd_expected_healthy_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -614,7 +636,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  return_code, out = run_command_for_value(
  cmd_expected_healthy_nodes,
  'Count expected healthy nodes per nodepool',
- args,
  )
  if return_code != 0:
  return [], return_code
@@ -622,7 +643,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+ def get_actual_healthy_nodes() -> tuple[list[str], int]:
  cmd_actual_healthy_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODE_NAME:metadata.name,"
@@ -635,7 +656,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+ cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
  )
  if return_code != 0:
  return [], return_code
@@ -643,7 +664,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
  return out.splitlines(), 0


- def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+ def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
  cmd_total_nodes = (
  'kubectl get node --no-headers=true -o'
  " custom-columns='NODE_NAME:metadata.name,"
@@ -655,7 +676,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
  ' | uniq -c'
  )
  return_code, out = run_command_for_value(
- cmd_total_nodes, 'Count total nodes per nodepool', args
+ cmd_total_nodes, 'Count total nodes per nodepool'
  )
  if return_code != 0:
  return [], return_code
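
A pattern running through all of the helpers above: run_command_with_updates and run_command_for_value no longer take a trailing args parameter (xpk/core/commands.py changes by +11 −18). The new signatures are not shown in this diff, but the pre-existing import of is_dry_run from xpk/utils/execution_context suggests per-invocation state such as dry-run now comes from a module-level execution context rather than being threaded through every call. A hypothetical sketch of that pattern (all names except is_dry_run are assumptions):

    # Hypothetical sketch of an execution-context pattern; not the real code.
    import subprocess

    _DRY_RUN = False

    def set_dry_run(value: bool) -> None:
      global _DRY_RUN
      _DRY_RUN = value

    def is_dry_run() -> bool:
      return _DRY_RUN

    def run_command_for_value(command: str, description: str) -> tuple[int, str]:
      if is_dry_run():
        print(f'[dry-run] {description}: {command}')
        return 0, ''
      result = subprocess.run(command, shell=True, capture_output=True, text=True)
      return result.returncode, result.stdout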
@@ -672,6 +693,8 @@ def cluster_list(args) -> None:
  Returns:
  0 if successful and 1 otherwise.
  """
+ if should_validate_dependencies(args):
+ validate_dependencies_list([SystemDependency.GCLOUD])
  add_zone_and_project(args)
  xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
  if run_gke_clusters_list_command(args):
@@ -707,20 +730,20 @@ def cluster_create_ray_cluster(args) -> None:
  cluster_create(args)


- def install_jq(args):
+ def install_jq():
  """Installs 'jq' utility."""
  if shutil.which('jq'):
  xpk_print("Task: 'Install jq' skipped, jq already installed.")
  return
  command_jq_install = 'sudo apt install jq -y'
  xpk_print("Task: 'Install jq' in progress.")
- return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+ return_code = run_command_with_updates(command_jq_install, 'Install jq')
  if return_code != 0:
  xpk_print(f'Install jq error {return_code}')
  xpk_exit(return_code)


- def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+ def clone_coredns_deployment_repo(coredns_repo_full_path: str):
  """Clones the CoreDNS deployment repository if it doesn't exist."""
  if os.path.exists(coredns_repo_full_path):
  xpk_print(
@@ -735,15 +758,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
  "Task: 'Clone deployment' in progress, Target"
  f' directory:{coredns_repo_full_path}.'
  )
- return_code = run_command_with_updates(
- command_git_clone, 'Clone deployment', args
- )
+ return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
  if return_code != 0:
  xpk_print(f'Clone deployment error {return_code}')
  xpk_exit(return_code)


- def deploy_coredns_manifests(args, coredns_k8s_path: str):
+ def deploy_coredns_manifests(coredns_k8s_path: str):
  """Deploys CoreDNS manifests to the cluster."""
  if not os.path.isdir(coredns_k8s_path):
  xpk_print(
@@ -761,7 +782,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
  f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
  )
  return_code = run_command_with_updates(
- command_deploy_coredns, 'Deploy CoreDNS', args
+ command_deploy_coredns, 'Deploy CoreDNS'
  )
  if return_code != 0:
  xpk_print(f'Deploy CoreDNS error {return_code}')
@@ -773,9 +794,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
  xpk_exit(return_code)


- def scale_down_deployment(
- args, deployment_name: str, namespace: str = 'kube-system'
- ):
+ def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
  """Scales down a specified Kubernetes deployment to 0 replicas."""
  command = (
  f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -783,29 +802,27 @@ def scale_down_deployment(
  )
  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
  return_code = run_command_with_updates(
- command, f'Scale down {deployment_name}', args
+ command, f'Scale down {deployment_name}'
  )
  if return_code != 0:
  xpk_print(f'Scale down {deployment_name} error {return_code}')
  xpk_exit(return_code)
- xpk_print(f'\n{deployment_name} has been scaled down.')
+ xpk_print(f'{deployment_name} has been scaled down.')


- def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+ def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
  """Scales up the CoreDNS deployment to a specified number of replicas."""
  command_coredns_scale = (
  f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
  )
  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
- return_code = run_command_with_updates(
- command_coredns_scale, 'Scale CoreDNS', args
- )
+ return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
  if return_code != 0:
  xpk_print(f'Scale CoreDNS error {return_code}')
  xpk_exit(return_code)


- def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+ def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
  """Check for the existence of a specific Deployment in a given namespace."""
  # TODO: rewrite this to be more obvious, check if it is correct
  command = (
@@ -813,17 +830,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
  f' {namespace} --ignore-not-found'
  )
  result = run_command_with_updates(
- command, 'Waiting for kubeDNS to be checked.', args
+ command, 'Waiting for kubeDNS to be checked.'
  )
  return result != 0


  def verify_coredns_readiness(
- args, timeout: int = 240, namespace: str = 'kube-system'
+ timeout: int = 240, namespace: str = 'kube-system'
  ):
  """Verifies CoreDNS readiness using kubectl wait commands."""
  xpk_print('Now verifying CoreDNS readiness...')
- kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+ kube_dns_exists = check_deployment_exists('kube-dns', namespace)
  if kube_dns_exists:
  # Wait for kube-dns to be fully scaled down
  command_kube_dns_wait_scaled_down = (
@@ -833,7 +850,7 @@ def verify_coredns_readiness(
  )
  xpk_print('Verifying if kube-dns has scaled down...')
  return_code_kube_dns = run_command_with_updates(
- command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+ command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
  )
  if return_code_kube_dns != 0:
  xpk_print('kube-dns did not scale down successfully within the timeout.')
@@ -849,7 +866,7 @@ def verify_coredns_readiness(
  )
  xpk_print('Verifying if CoreDNS is available...')
  return_code_coredns = run_command_with_updates(
- command_coredns_wait_available, 'Wait for coredns available', args
+ command_coredns_wait_available, 'Wait for coredns available'
  )
  if return_code_coredns != 0:
  xpk_print(
@@ -874,12 +891,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
  xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')


- def update_coredns(args) -> int:
+ def update_coredns() -> int:
  """Updates and deploys CoreDNS within a cluster.

- Args:
- args: user provided arguments for running the command.
-
  Returns:
  0 if successful and 1 otherwise.
  """
@@ -888,23 +902,23 @@ def update_coredns(args) -> int:
  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
  # 1. Install jq
- install_jq(args)
+ install_jq()

  # 2. Clone CoreDNS deployment repository
- clone_coredns_deployment_repo(args, coredns_repo_full_path)
+ clone_coredns_deployment_repo(coredns_repo_full_path)

  # 3. Deploy CoreDNS to the cluster
- deploy_coredns_manifests(args, coredns_k8s_path)
+ deploy_coredns_manifests(coredns_k8s_path)

  # 4. Scale down kube-dns-autoscaler
- scale_down_deployment(args, 'kube-dns-autoscaler')
+ scale_down_deployment('kube-dns-autoscaler')

  # 5. Scale down kube-dns
- scale_down_deployment(args, 'kube-dns')
+ scale_down_deployment('kube-dns')

  # 6. Scale up coredns and verify readiness
- scale_up_coredns(args, replicas=15)
- verify_coredns_readiness(args, timeout=120)
+ scale_up_coredns(replicas=15)
+ verify_coredns_readiness(timeout=120)

  xpk_print('The CoreDNS setup process has been completed.')

@@ -914,7 +928,7 @@ def update_coredns(args) -> int:
  return 0


- def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+ def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
  """Checks if the CoreDNS deployment exists in the given namespace.

  Args:
@@ -929,10 +943,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
  f' namespace: {namespace}'
  )
  return_code = run_command_with_updates(
- command, f'Check CoreDNS deployment in {namespace}', args
+ command, f'Check CoreDNS deployment in {namespace}'
  )
  if return_code == 0:
- verify_coredns_readiness(args)
+ verify_coredns_readiness()
  xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
  return True
  else:
@@ -943,25 +957,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
  return False


- def update_coredns_if_necessary(args) -> int:
+ def update_coredns_if_necessary() -> int:
  """Updates and deploys CoreDNS within the cluster if it's not already present.

  This function checks for the existence of the CoreDNS deployment.
  If it's not found, it proceeds to deploy and configure CoreDNS.

- Args:
- args: User-provided arguments for running the command.
-
  Returns:
  0 if successful (CoreDNS was already present or successfully deployed),
  and 1 otherwise.
  """
- if coredns_deployment_exists(args, namespace='kube-system'):
+ if coredns_deployment_exists(namespace='kube-system'):
  xpk_print('Skipping CoreDNS deployment since it already exists.')
  return 0
  else:
  xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
- return update_coredns(args)
+ return update_coredns()


  def create_cluster_if_necessary(
@@ -1021,10 +1032,10 @@ def run_gke_cluster_delete_command(args) -> int:
  command = (
  'gcloud beta container clusters delete'
  f' {args.cluster} --project={args.project}'
- f' --region={zone_to_region(args.zone)} --quiet'
+ f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
  )

- return_code = run_command_with_updates(command, 'Cluster Delete', args)
+ return_code = run_command_with_updates(command, 'Cluster Delete')
  if return_code != 0:
  xpk_print(f'Cluster delete request returned ERROR {return_code}')
  return 1
@@ -1047,9 +1058,9 @@ def run_gke_clusters_list_command(args) -> int:
  """
  command = (
  'gcloud container clusters list'
- f' --project={args.project} --region={zone_to_region(args.zone)}'
+ f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
  )
- return_code = run_command_with_updates(command, 'Cluster List', args)
+ return_code = run_command_with_updates(command, 'Cluster List')
  if return_code != 0:
  xpk_print(f'Cluster list request returned ERROR {return_code}')
  return 1
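
The list command above swaps --region=<region> for --filter=location~"<region>.*". A zonal cluster reports its location as a zone such as us-central1-a while a regional cluster reports us-central1, so the regex prefix match presumably lets a single listing cover both. Illustrating the match behaviour:

    # location~"<region>.*" is a regex prefix match, so it covers both
    # regional and zonal cluster locations within the region.
    import re

    pattern = re.compile(r'us-central1.*')
    assert pattern.match('us-central1')    # regional cluster location
    assert pattern.match('us-central1-a')  # zonal cluster location
    assert not pattern.match('us-east1-b')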
@@ -1105,6 +1116,7 @@ def run_gke_cluster_create_command(
  f' {rapid_release_cmd}'
  ' --enable-dns-access'
  ' --autoscaling-profile=optimize-utilization'
+ ' --labels=gke_product_type=xpk'
  )

  enable_ip_alias = False
@@ -1158,7 +1170,7 @@ def run_gke_cluster_create_command(
  addons_str = ','.join(addons)
  command += f' --addons={addons_str}'

- return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+ return_code = run_command_with_updates(command, 'GKE Cluster Create')
  if return_code != 0:
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
  return 1
@@ -1204,12 +1216,12 @@ def install_storage_csis(args):

  def install_kjob(args):
  xpk_print('Verifying kjob installation')
- err_code = verify_kjob_installed(args)
+ err_code = verify_kjob_installed()
  if err_code > 0:
  xpk_exit(err_code)

  xpk_print('Applying kjob CDRs')
- err_code = apply_kjob_crds(args)
+ err_code = apply_kjob_crds()
  if err_code > 0:
  xpk_exit(err_code)

@@ -1220,42 +1232,43 @@ def install_kjob(args):

  def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
  xpk_print('Enabling Kueue on the cluster')
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
- if install_kueue_on_cluster_code != 0:
- xpk_exit(install_kueue_on_cluster_code)
-
- xpk_print('Wait for Kueue to be fully available')
- wait_for_kueue_available_code = wait_for_kueue_available(args)
- if wait_for_kueue_available_code != 0:
- xpk_exit(wait_for_kueue_available_code)
-
- xpk_print('Install Kueue Custom Resources')
- enable_kueue_credentials_code = install_kueue_crs(
- args, system, autoprovisioning_config
+ autoprovisioning_enabled = False
+ if autoprovisioning_config:
+ # Determine total resources available based on autoprovisioning max chips.
+ autoprovisioning_enabled = True
+ total_chips = autoprovisioning_config.maximum_chips
+ else:
+ # Determine total chips based on user specified topology.
+ total_chips = get_total_chips_requested_from_args(args, system)
+ kueue_manager = KueueManager()
+ kueue_manager.install_or_upgrade(
+ KueueConfig(
+ system,
+ total_chips=total_chips,
+ autoprovisioning_enabled=autoprovisioning_enabled,
+ num_slices=args.num_slices,
+ flex=args.flex,
+ memory_limit=args.memory_limit,
+ cpu_limit=args.cpu_limit,
+ is_pathways_cluster=args.enable_pathways,
+ ),
  )
- if enable_kueue_credentials_code != 0:
- xpk_exit(enable_kueue_credentials_code)
-
- xpk_print('Update Kueue Controller Manager resources')
- update_kueue_resources_code = update_kueue_resources_if_necessary(args)
- if update_kueue_resources_code != 0:
- xpk_exit(update_kueue_resources_code)


- def prepare_gpus(args, system: SystemCharacteristics):
+ def prepare_gpus(system: SystemCharacteristics):
  xpk_print('Installing NCCL Plugin for cluster')
- install_nccl_code = install_nccl_on_cluster(args, system)
+ install_nccl_code = install_nccl_on_cluster(system)
  if install_nccl_code != 0:
  xpk_exit(install_nccl_code)

  if system.device_type == H100_DEVICE_TYPE:
  xpk_print('Installing NRI device injector for cluster')
- install_nri_code = install_nri_on_cluster(args)
+ install_nri_code = install_nri_on_cluster()
  if install_nri_code != 0:
  xpk_exit(install_nri_code)

  if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
  xpk_print('Disabling MGLRU')
- err_code = disable_mglru_on_cluster(args)
+ err_code = disable_mglru_on_cluster()
  if err_code > 0:
  xpk_exit(err_code)
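
This final hunk is the core of the Kueue rework: the four sequential 0.13.0 steps (install_kueue_on_cluster, wait_for_kueue_available, install_kueue_crs, update_kueue_resources_if_necessary) collapse into a single KueueManager.install_or_upgrade(KueueConfig(...)) call, backed by the new xpk/core/kueue_manager.py (+383 lines, with 542 lines of tests) and the deletion of xpk/core/kueue.py (−561). Only the call site appears in this diff; a rough sketch of the shape it implies, with field types and internals assumed:

    # Rough sketch inferred from the call site; not the real kueue_manager.py.
    from dataclasses import dataclass

    @dataclass
    class KueueConfig:
      system: 'SystemCharacteristics'  # from xpk/core/system_characteristics.py
      total_chips: int
      autoprovisioning_enabled: bool = False
      num_slices: int = 1
      flex: bool = False
      memory_limit: str | None = None
      cpu_limit: int | None = None
      is_pathways_cluster: bool = False

    class KueueManager:

      def install_or_upgrade(self, config: KueueConfig) -> None:
        # Presumably sequences what were four separate call sites in 0.13.0:
        # install Kueue, wait for it to become available, apply the Kueue
        # custom resources, then tune controller-manager resources.
        ...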