xpk-0.13.0-py3-none-any.whl → xpk-0.14.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. integration/__init__.py +15 -0
  2. integration/docker_manager_test.py +102 -0
  3. integration/gcluster_a3mega_test.py +204 -0
  4. integration/gcluster_a3ultra_test.py +176 -0
  5. integration/gcluster_a4_test.py +176 -0
  6. integration/gcluster_test.py +107 -0
  7. xpk/commands/batch.py +9 -2
  8. xpk/commands/cluster.py +143 -117
  9. xpk/commands/cluster_gcluster.py +81 -14
  10. xpk/commands/cluster_gcluster_test.py +177 -0
  11. xpk/commands/cluster_test.py +92 -0
  12. xpk/commands/common.py +14 -26
  13. xpk/commands/info.py +11 -9
  14. xpk/commands/inspector.py +21 -10
  15. xpk/commands/job.py +25 -9
  16. xpk/commands/kind.py +39 -40
  17. xpk/commands/kjob_common.py +4 -4
  18. xpk/commands/run.py +9 -2
  19. xpk/commands/shell.py +13 -10
  20. xpk/commands/storage.py +21 -0
  21. xpk/commands/version.py +0 -4
  22. xpk/commands/workload.py +84 -29
  23. xpk/commands/workload_test.py +81 -0
  24. xpk/core/blueprint/blueprint_generator.py +4 -40
  25. xpk/core/blueprint/blueprint_test.py +0 -6
  26. xpk/core/blueprint/testing/__init__.py +15 -0
  27. xpk/core/capacity.py +6 -5
  28. xpk/core/cluster.py +91 -194
  29. xpk/core/cluster_private.py +6 -11
  30. xpk/core/commands.py +11 -18
  31. xpk/core/config.py +1 -1
  32. xpk/core/docker_image.py +3 -4
  33. xpk/core/gcloud_context.py +26 -2
  34. xpk/core/gcloud_context_test.py +96 -0
  35. xpk/core/gcluster_manager.py +0 -3
  36. xpk/core/jobset.py +4 -7
  37. xpk/core/kjob.py +14 -27
  38. xpk/core/kueue_manager.py +423 -0
  39. xpk/core/kueue_manager_test.py +574 -0
  40. xpk/core/monitoring.py +1 -1
  41. xpk/core/nap.py +10 -15
  42. xpk/core/network.py +17 -18
  43. xpk/core/nodepool.py +66 -77
  44. xpk/core/nodepool_test.py +198 -1
  45. xpk/core/pathways.py +5 -5
  46. xpk/core/ray.py +10 -14
  47. xpk/core/resources.py +6 -11
  48. xpk/core/scheduling.py +19 -1
  49. xpk/core/scheduling_test.py +31 -0
  50. xpk/core/system_characteristics.py +350 -232
  51. xpk/core/system_characteristics_test.py +73 -0
  52. xpk/core/vertex.py +1 -1
  53. xpk/core/workload.py +7 -8
  54. xpk/main.py +2 -4
  55. xpk/parser/cluster.py +7 -0
  56. xpk/parser/cluster_test.py +66 -0
  57. xpk/parser/common.py +11 -0
  58. xpk/parser/workload.py +62 -25
  59. xpk/parser/workload_test.py +82 -0
  60. xpk/templates/cluster_preheat.yaml.j2 +31 -0
  61. xpk/templates/filestore-pv.yaml +17 -0
  62. xpk/templates/filestore-pvc.yaml +11 -0
  63. xpk/templates/filestore-sc.yaml +10 -0
  64. xpk/templates/fuse-pv.yaml +17 -0
  65. xpk/templates/fuse-pvc.yaml +13 -0
  66. xpk/templates/kueue_config.yaml.j2 +95 -0
  67. xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
  68. xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
  69. xpk/templates/mtc-cpc.yaml +15 -0
  70. xpk/templates/volume_bundle.yaml +7 -0
  71. xpk/utils/feature_flags.py +28 -0
  72. xpk/utils/kueue.py +20 -0
  73. xpk/utils/templates.py +15 -0
  74. xpk/utils/topology.py +46 -0
  75. xpk/utils/topology_test.py +63 -0
  76. xpk/utils/validation.py +79 -55
  77. xpk/utils/validation_test.py +37 -0
  78. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
  79. xpk-0.14.1.dist-info/RECORD +133 -0
  80. xpk-0.14.1.dist-info/top_level.txt +2 -0
  81. xpk/core/kueue.py +0 -561
  82. xpk-0.13.0.dist-info/RECORD +0 -101
  83. xpk-0.13.0.dist-info/top_level.txt +0 -1
  84. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
  85. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
  86. {xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0
xpk/commands/cluster.py CHANGED
@@ -16,6 +16,7 @@ limitations under the License.
 
 from tabulate import tabulate
 
+from ..utils.feature_flags import FeatureFlags
 from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
 from ..core.cluster import (
     get_all_clusters_programmatic,
@@ -41,17 +42,12 @@ from ..core.gcloud_context import (
     add_zone_and_project,
     get_gke_control_plane_version,
     get_gke_server_config,
+    get_cluster_location,
     zone_to_region,
 )
 from ..core.jobset import update_jobset_resources_if_necessary
 from ..core.kjob import apply_kjob_crds, prepare_kjob, verify_kjob_installed
-from ..core.kueue import (
-    cluster_preheat_yml,
-    install_kueue_crs,
-    install_kueue_on_cluster,
-    wait_for_kueue_available,
-    update_kueue_resources_if_necessary,
-)
+from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.nap import enable_autoprovisioning_on_cluster
 from ..core.network import (
     create_cluster_network_config,
@@ -65,6 +61,7 @@ from ..core.nodepool import (
 from ..core.ray import install_ray_cluster
 from ..core.mtc import install_mtc_on_cluster
 from ..core.resources import create_cluster_configmaps
+from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
     AcceleratorType,
@@ -77,11 +74,16 @@ from ..core.workload import get_workload_list
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import set_cluster_command
+from .common import set_cluster_command, validate_sub_slicing_system
+from jinja2 import Environment, FileSystemLoader
+from ..utils.templates import get_templates_absolute_path
 import shutil
 import os
 
+CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
+
 
 def cluster_adapt(args) -> None:
   """Function that performs cluster adaptation.
@@ -89,6 +91,12 @@ def cluster_adapt(args) -> None:
   Args:
     args: user provided arguments for running the command.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   args.enable_pathways = False
 
   system, return_code = get_system_characteristics(args)
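
Note: should_validate_dependencies and validate_dependencies_list come from xpk/utils/validation.py (reworked in this release, +79 -55), whose bodies are not part of this hunk. A minimal sketch of the pattern, assuming each SystemDependency names an executable probed with shutil.which; the enum values and error handling below are illustrative, not the package's actual implementation:

    # Illustrative sketch only; see xpk/utils/validation.py for the real code.
    import enum
    import shutil


    class SystemDependency(enum.Enum):
      KUBECTL = 'kubectl'
      KJOB = 'kubectl-kjob'  # assumed binary name
      GCLOUD = 'gcloud'


    def validate_dependencies_list(deps: list[SystemDependency]) -> None:
      """Exit early if any required CLI tool is missing from PATH."""
      missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
      if missing:
        raise SystemExit(f'Missing required tools: {", ".join(missing)}')
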
@@ -109,7 +117,7 @@ def cluster_adapt(args) -> None:
         'Argument --num-nodes was not provided, trying to determine number of'
         ' nodes based on the available nodes in the cluster...'
     )
-    args.num_nodes = count_nodes_on_cluster(args, system)
+    args.num_nodes = count_nodes_on_cluster(system)
     if args.num_nodes == 0:
       xpk_print(
           'Found unexpected number of nodes. Is the --device-type correct?'
@@ -176,7 +184,7 @@ def cluster_adapt(args) -> None:
 
   install_kjob(args)
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
@@ -188,23 +196,36 @@ def cluster_adapt(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
 
+def _validate_cluster_create_args(args, system: SystemCharacteristics):
+  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
+    validate_sub_slicing_system(system)
+
+
 def cluster_create(args) -> None:
   """Function around cluster creation.
 
   Args:
     args: user provided arguments for running the command.
   """
-  system, return_code = get_system_characteristics(args)
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
 
+  system, return_code = get_system_characteristics(args)
   if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
+  _validate_cluster_create_args(args, system)
+
   xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
   add_zone_and_project(args)
 
@@ -249,7 +270,7 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
-  update_coredns_command_code = update_coredns_if_necessary(args)
+  update_coredns_command_code = update_coredns_if_necessary()
   if update_coredns_command_code != 0:
     xpk_exit(update_cluster_command_code)
 
@@ -317,7 +338,7 @@ def cluster_create(args) -> None:
   set_jobset_on_cluster_code = set_jobset_on_cluster(args)
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
-  update_jobset_resources_code = update_jobset_resources_if_necessary(args)
+  update_jobset_resources_code = update_jobset_resources_if_necessary()
   if update_jobset_resources_code != 0:
     xpk_exit(update_jobset_resources_code)
 
@@ -330,7 +351,7 @@ def cluster_create(args) -> None:
   install_kjob(args)
 
   if system.accelerator_type == AcceleratorType['GPU']:
-    prepare_gpus(args, system)
+    prepare_gpus(system)
 
   if args.enable_ray_cluster:
     return_code = install_ray_cluster(args, system)
@@ -348,7 +369,7 @@ def cluster_create(args) -> None:
   xpk_print(
       'See your GKE Cluster here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/clusters/details/{zone_to_region(args.zone)}/{args.cluster}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
   xpk_exit(0)
 
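Note: this release swaps zone_to_region(args.zone) for get_cluster_location(args.project, args.cluster, args.zone) when building console URLs and gcloud commands, so the location is resolved from the cluster itself rather than assumed to be the zone's region. The helper is added in xpk/core/gcloud_context.py (+26 lines) and its body is not shown in this diff; a hypothetical sketch of what such a lookup can do, using only standard gcloud flags:

    # Hypothetical sketch; the real helper lives in xpk/core/gcloud_context.py.
    import subprocess


    def get_cluster_location(project: str, cluster: str, zone: str) -> str:
      """Return the cluster's actual location, else fall back to the zone's region."""
      result = subprocess.run(
          [
              'gcloud', 'container', 'clusters', 'list',
              f'--project={project}',
              f'--filter=name={cluster}',
              '--format=value(location)',
          ],
          capture_output=True,
          text=True,
          check=False,
      )
      location = result.stdout.strip()
      # zone_to_region-style fallback: 'us-central1-a' -> 'us-central1'
      return location or zone.rsplit('-', 1)[0]
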
@@ -362,6 +383,8 @@ def cluster_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
@@ -391,6 +414,10 @@ def cluster_cacheimage(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(
       f'Starting cluster cacheimage for cluster: {args.cluster}', flush=True
   )
@@ -406,25 +433,28 @@ def cluster_cacheimage(args) -> None:
   node_selector_key = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].accelerator_label
-  yml_string = cluster_preheat_yml.format(
+
+  template_env = Environment(
+      loader=FileSystemLoader(searchpath=get_templates_absolute_path())
+  )
+  cluster_preheat_yaml = template_env.get_template(CLUSTER_PREHEAT_JINJA_FILE)
+  rendered_yaml = cluster_preheat_yaml.render(
       cachekey=args.cache_key,
       image_name=args.docker_image,
       nodeSelectorKey=node_selector_key,
   )
-  tmp = write_tmp_file(yml_string)
+  tmp = write_tmp_file(rendered_yaml)
   command_apply = f'kubectl apply -f {str(tmp)}'
   command_delete = f'kubectl delete -f {str(tmp)} --ignore-not-found=true'
 
   return_code = run_command_with_updates(
-      command_delete, 'Deleting Cached Image', args
+      command_delete, 'Deleting Cached Image'
   )
   if return_code != 0:
     xpk_print(f'Delete Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  return_code = run_command_with_updates(
-      command_apply, 'Creating Cached Image', args
-  )
+  return_code = run_command_with_updates(command_apply, 'Creating Cached Image')
   if return_code != 0:
     xpk_print(f'Create Cached Image returned ERROR {return_code}')
     xpk_exit(return_code)
@@ -440,12 +470,16 @@ def cluster_describe(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print(f'Starting nodepool list for cluster: {args.cluster}', flush=True)
   add_zone_and_project(args)
 
   get_cluster_credentials(args)
 
-  return_code, data_table = nodepools_build_table(args)
+  return_code, data_table = nodepools_build_table()
   if return_code != 0:
     xpk_exit(return_code)
 
@@ -461,7 +495,6 @@ def cluster_describe(args) -> None:
       r'kubectl get node --no-headers=true'
       r" --selector='cloud.google.com/gke-tpu-accelerator' | wc -l",
       'Count TPU Nodes',
-      args,
   )
   if return_code_node_output != 0:
     xpk_exit(return_code_node_output)
@@ -472,7 +505,6 @@ def cluster_describe(args) -> None:
       "kubectl get pod -o=custom-columns='Status:.status.phase' | grep -i"
       ' Running | wc -l',
       'Count TPU Pods',
-      args,
   )
   if return_code_pod_output != 0:
     xpk_exit(return_code_pod_output)
@@ -487,7 +519,7 @@ def cluster_describe(args) -> None:
   xpk_exit(0)
 
 
-def nodepools_build_table(args) -> tuple[int, list[list]]:
+def nodepools_build_table() -> tuple[int, list[list]]:
   table = [[
       'NODEPOOL_NAME',
       'SLICE',
@@ -499,14 +531,14 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
 
   nodepools_data = {}
 
-  nodepools, return_code = get_node_pools_name(args)
+  nodepools, return_code = get_node_pools_name()
   if return_code != 0:
     xpk_print(f'Get node pools name returned ERROR {return_code}')
 
   for name in nodepools:
     nodepools_data[name] = [name]
 
-  slices, return_code = get_slice_node_pool_size(args)
+  slices, return_code = get_slice_node_pool_size()
   if return_code != 0:
     xpk_print(f'Get slice node pool size returned ERROR {return_code}')
 
@@ -515,7 +547,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = s[0], s[1]
     nodepools_data[nodepool_name].append(count)
 
-  type_nodepool, return_code = get_node_pool_instance_type(args)
+  type_nodepool, return_code = get_node_pool_instance_type()
   if return_code != 0:
     xpk_print(f'Get node pool instance type returned ERROR {return_code}')
 
@@ -524,7 +556,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     nodepool_name, instance_type = tn[0], tn[1]
     nodepools_data[nodepool_name].append(instance_type)
 
-  expected_healthy_nodes, return_code = get_expected_healthy_nodes(args)
+  expected_healthy_nodes, return_code = get_expected_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get expected healthy nodes returned ERROR {return_code}')
 
@@ -533,7 +565,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ehn[0], ehn[1]
     nodepools_data[nodepool_name].append(count)
 
-  actual_healthy_nodes, return_code = get_actual_healthy_nodes(args)
+  actual_healthy_nodes, return_code = get_actual_healthy_nodes()
   if return_code != 0:
     xpk_print(f'Get actual healthy nodes returned ERROR {return_code}')
 
@@ -542,7 +574,7 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
     count, nodepool_name = ahn[0], ahn[1]
     nodepools_data[nodepool_name].append(count)
 
-  total_nodes, return_code = get_total_nodes_per_node_pool(args)
+  total_nodes, return_code = get_total_nodes_per_node_pool()
   if return_code != 0:
     xpk_print(f'Get total nodes per node pool returned ERROR {return_code}')
 
@@ -557,20 +589,20 @@ def nodepools_build_table(args) -> tuple[int, list[list]]:
   return 0, table
 
 
-def get_node_pools_name(args) -> tuple[list[str], int]:
+def get_node_pools_name() -> tuple[list[str], int]:
   cmd_nodepools = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool'"
      " | grep -v 'none' | sort | uniq"
   )
-  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list', args)
+  return_code, out = run_command_for_value(cmd_nodepools, 'Nodepool list')
   if return_code != 0:
     return [], return_code
 
   return out.splitlines(), 0
 
 
-def get_slice_node_pool_size(args) -> tuple[list[str], int]:
+def get_slice_node_pool_size() -> tuple[list[str], int]:
   cmd_slices = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -579,7 +611,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_slices, 'Count nodes per nodepool slice', args
+      cmd_slices, 'Count nodes per nodepool slice'
   )
   if return_code != 0:
     return [], return_code
@@ -587,7 +619,7 @@ def get_slice_node_pool_size(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_node_pool_instance_type(args) -> tuple[list[str], int]:
+def get_node_pool_instance_type() -> tuple[list[str], int]:
   cmd_type_nodepool = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODEPOOL:.metadata.labels.cloud\\.google\\.com/gke-nodepool,"
@@ -595,7 +627,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
       " 'none' | sort | uniq"
   )
   return_code, out = run_command_for_value(
-      cmd_type_nodepool, 'Instance type of nodepools', args
+      cmd_type_nodepool, 'Instance type of nodepools'
   )
   if return_code != 0:
     return [], return_code
@@ -603,7 +635,7 @@ def get_node_pool_instance_type(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
+def get_expected_healthy_nodes() -> tuple[list[str], int]:
   cmd_expected_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns=':metadata.labels.cloud\\.google\\.com/gke-nodepool'"
@@ -614,7 +646,6 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return_code, out = run_command_for_value(
       cmd_expected_healthy_nodes,
       'Count expected healthy nodes per nodepool',
-      args,
   )
   if return_code != 0:
     return [], return_code
@@ -622,7 +653,7 @@ def get_expected_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
+def get_actual_healthy_nodes() -> tuple[list[str], int]:
   cmd_actual_healthy_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"
@@ -635,7 +666,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool', args
+      cmd_actual_healthy_nodes, 'Count actual healthy nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -643,7 +674,7 @@ def get_actual_healthy_nodes(args) -> tuple[list[str], int]:
   return out.splitlines(), 0
 
 
-def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
+def get_total_nodes_per_node_pool() -> tuple[list[str], int]:
   cmd_total_nodes = (
       'kubectl get node --no-headers=true -o'
       " custom-columns='NODE_NAME:metadata.name,"
@@ -655,7 +686,7 @@ def get_total_nodes_per_node_pool(args) -> tuple[list[str], int]:
       ' | uniq -c'
   )
   return_code, out = run_command_for_value(
-      cmd_total_nodes, 'Count total nodes per nodepool', args
+      cmd_total_nodes, 'Count total nodes per nodepool'
   )
   if return_code != 0:
     return [], return_code
@@ -672,6 +703,8 @@ def cluster_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   add_zone_and_project(args)
   xpk_print(f'For project {args.project} and zone {args.zone}:', flush=True)
   if run_gke_clusters_list_command(args):
@@ -707,20 +740,20 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
-def install_jq(args):
+def install_jq():
   """Installs 'jq' utility."""
   if shutil.which('jq'):
     xpk_print("Task: 'Install jq' skipped, jq already installed.")
     return
   command_jq_install = 'sudo apt install jq -y'
   xpk_print("Task: 'Install jq' in progress.")
-  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  return_code = run_command_with_updates(command_jq_install, 'Install jq')
   if return_code != 0:
     xpk_print(f'Install jq error {return_code}')
     xpk_exit(return_code)
 
 
-def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+def clone_coredns_deployment_repo(coredns_repo_full_path: str):
   """Clones the CoreDNS deployment repository if it doesn't exist."""
   if os.path.exists(coredns_repo_full_path):
     xpk_print(
@@ -735,15 +768,13 @@ def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
       "Task: 'Clone deployment' in progress, Target"
       f' directory:{coredns_repo_full_path}.'
   )
-  return_code = run_command_with_updates(
-      command_git_clone, 'Clone deployment', args
-  )
+  return_code = run_command_with_updates(command_git_clone, 'Clone deployment')
   if return_code != 0:
     xpk_print(f'Clone deployment error {return_code}')
     xpk_exit(return_code)
 
 
-def deploy_coredns_manifests(args, coredns_k8s_path: str):
+def deploy_coredns_manifests(coredns_k8s_path: str):
   """Deploys CoreDNS manifests to the cluster."""
   if not os.path.isdir(coredns_k8s_path):
     xpk_print(
@@ -761,7 +792,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
       f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
   )
   return_code = run_command_with_updates(
-      command_deploy_coredns, 'Deploy CoreDNS', args
+      command_deploy_coredns, 'Deploy CoreDNS'
   )
   if return_code != 0:
     xpk_print(f'Deploy CoreDNS error {return_code}')
@@ -773,9 +804,7 @@ def deploy_coredns_manifests(args, coredns_k8s_path: str):
     xpk_exit(return_code)
 
 
-def scale_down_deployment(
-    args, deployment_name: str, namespace: str = 'kube-system'
-):
+def scale_down_deployment(deployment_name: str, namespace: str = 'kube-system'):
   """Scales down a specified Kubernetes deployment to 0 replicas."""
   command = (
       f'kubectl scale deployment {deployment_name} --replicas=0'
@@ -783,29 +812,27 @@ def scale_down_deployment(
   )
   xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
   return_code = run_command_with_updates(
-      command, f'Scale down {deployment_name}', args
+      command, f'Scale down {deployment_name}'
   )
   if return_code != 0:
     xpk_print(f'Scale down {deployment_name} error {return_code}')
     xpk_exit(return_code)
-  xpk_print(f'\n{deployment_name} has been scaled down.')
+  xpk_print(f'{deployment_name} has been scaled down.')
 
 
-def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+def scale_up_coredns(replicas: int = 15, namespace: str = 'kube-system'):
   """Scales up the CoreDNS deployment to a specified number of replicas."""
   command_coredns_scale = (
       f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
   )
   xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
-  return_code = run_command_with_updates(
-      command_coredns_scale, 'Scale CoreDNS', args
-  )
+  return_code = run_command_with_updates(command_coredns_scale, 'Scale CoreDNS')
   if return_code != 0:
     xpk_print(f'Scale CoreDNS error {return_code}')
     xpk_exit(return_code)
 
 
-def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+def check_deployment_exists(deployment_name: str, namespace: str) -> bool:
   """Check for the existence of a specific Deployment in a given namespace."""
   # TODO: rewrite this to be more obvious, check if it is correct
   command = (
@@ -813,17 +840,17 @@ def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
       f' {namespace} --ignore-not-found'
   )
   result = run_command_with_updates(
-      command, 'Waiting for kubeDNS to be checked.', args
+      command, 'Waiting for kubeDNS to be checked.'
   )
   return result != 0
 
 
 def verify_coredns_readiness(
-    args, timeout: int = 240, namespace: str = 'kube-system'
+    timeout: int = 240, namespace: str = 'kube-system'
 ):
   """Verifies CoreDNS readiness using kubectl wait commands."""
   xpk_print('Now verifying CoreDNS readiness...')
-  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  kube_dns_exists = check_deployment_exists('kube-dns', namespace)
   if kube_dns_exists:
     # Wait for kube-dns to be fully scaled down
     command_kube_dns_wait_scaled_down = (
@@ -833,7 +860,7 @@ def verify_coredns_readiness(
     )
     xpk_print('Verifying if kube-dns has scaled down...')
     return_code_kube_dns = run_command_with_updates(
-        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down'
     )
     if return_code_kube_dns != 0:
       xpk_print('kube-dns did not scale down successfully within the timeout.')
@@ -849,7 +876,7 @@ def verify_coredns_readiness(
   )
   xpk_print('Verifying if CoreDNS is available...')
   return_code_coredns = run_command_with_updates(
-      command_coredns_wait_available, 'Wait for coredns available', args
+      command_coredns_wait_available, 'Wait for coredns available'
   )
   if return_code_coredns != 0:
     xpk_print(
@@ -874,12 +901,9 @@ def cleanup_coredns_repo(coredns_repo_full_path: str):
     xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
 
 
-def update_coredns(args) -> int:
+def update_coredns() -> int:
   """Updates and deploys CoreDNS within a cluster.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """
@@ -888,23 +912,23 @@ def update_coredns(args) -> int:
   coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
   coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
   # 1. Install jq
-  install_jq(args)
+  install_jq()
 
   # 2. Clone CoreDNS deployment repository
-  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+  clone_coredns_deployment_repo(coredns_repo_full_path)
 
   # 3. Deploy CoreDNS to the cluster
-  deploy_coredns_manifests(args, coredns_k8s_path)
+  deploy_coredns_manifests(coredns_k8s_path)
 
   # 4. Scale down kube-dns-autoscaler
-  scale_down_deployment(args, 'kube-dns-autoscaler')
+  scale_down_deployment('kube-dns-autoscaler')
 
   # 5. Scale down kube-dns
-  scale_down_deployment(args, 'kube-dns')
+  scale_down_deployment('kube-dns')
 
   # 6. Scale up coredns and verify readiness
-  scale_up_coredns(args, replicas=15)
-  verify_coredns_readiness(args, timeout=120)
+  scale_up_coredns(replicas=15)
+  verify_coredns_readiness(timeout=120)
 
   xpk_print('The CoreDNS setup process has been completed.')
 
@@ -914,7 +938,7 @@ def update_coredns(args) -> int:
   return 0
 
 
-def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+def coredns_deployment_exists(namespace: str = 'kube-system') -> bool:
   """Checks if the CoreDNS deployment exists in the given namespace.
 
   Args:
@@ -929,10 +953,10 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
       f' namespace: {namespace}'
   )
   return_code = run_command_with_updates(
-      command, f'Check CoreDNS deployment in {namespace}', args
+      command, f'Check CoreDNS deployment in {namespace}'
   )
   if return_code == 0:
-    verify_coredns_readiness(args)
+    verify_coredns_readiness()
     xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
     return True
   else:
@@ -943,25 +967,22 @@ def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
     return False
 
 
-def update_coredns_if_necessary(args) -> int:
+def update_coredns_if_necessary() -> int:
   """Updates and deploys CoreDNS within the cluster if it's not already present.
 
   This function checks for the existence of the CoreDNS deployment.
   If it's not found, it proceeds to deploy and configure CoreDNS.
 
-  Args:
-    args: User-provided arguments for running the command.
-
   Returns:
     0 if successful (CoreDNS was already present or successfully deployed),
     and 1 otherwise.
   """
-  if coredns_deployment_exists(args, namespace='kube-system'):
+  if coredns_deployment_exists(namespace='kube-system'):
     xpk_print('Skipping CoreDNS deployment since it already exists.')
     return 0
   else:
     xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
-    return update_coredns(args)
+    return update_coredns()
 
 
 def create_cluster_if_necessary(
@@ -1021,10 +1042,10 @@ def run_gke_cluster_delete_command(args) -> int:
   command = (
       'gcloud beta container clusters delete'
       f' {args.cluster} --project={args.project}'
-      f' --region={zone_to_region(args.zone)} --quiet'
+      f' --location={get_cluster_location(args.project, args.cluster, args.zone)} --quiet'
   )
 
-  return_code = run_command_with_updates(command, 'Cluster Delete', args)
+  return_code = run_command_with_updates(command, 'Cluster Delete')
   if return_code != 0:
     xpk_print(f'Cluster delete request returned ERROR {return_code}')
     return 1
@@ -1047,9 +1068,9 @@ def run_gke_clusters_list_command(args) -> int:
   """
   command = (
       'gcloud container clusters list'
-      f' --project={args.project} --region={zone_to_region(args.zone)}'
+      f' --project={args.project} --filter=location~"{zone_to_region(args.zone)}.*"'
   )
-  return_code = run_command_with_updates(command, 'Cluster List', args)
+  return_code = run_command_with_updates(command, 'Cluster List')
   if return_code != 0:
     xpk_print(f'Cluster list request returned ERROR {return_code}')
     return 1
@@ -1105,6 +1126,7 @@ def run_gke_cluster_create_command(
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
       ' --autoscaling-profile=optimize-utilization'
+      ' --labels=gke_product_type=xpk'
   )
 
   enable_ip_alias = False
@@ -1158,7 +1180,7 @@ def run_gke_cluster_create_command(
   addons_str = ','.join(addons)
   command += f' --addons={addons_str}'
 
-  return_code = run_command_with_updates(command, 'GKE Cluster Create', args)
+  return_code = run_command_with_updates(command, 'GKE Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1
@@ -1204,12 +1226,12 @@ def install_storage_csis(args):
 
 def install_kjob(args):
   xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
+  err_code = verify_kjob_installed()
   if err_code > 0:
     xpk_exit(err_code)
 
   xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
+  err_code = apply_kjob_crds()
   if err_code > 0:
     xpk_exit(err_code)
 
@@ -1220,42 +1242,46 @@ def install_storage_csis(args):
 
 def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
   xpk_print('Enabling Kueue on the cluster')
-  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
-  if install_kueue_on_cluster_code != 0:
-    xpk_exit(install_kueue_on_cluster_code)
-
-  xpk_print('Wait for Kueue to be fully available')
-  wait_for_kueue_available_code = wait_for_kueue_available(args)
-  if wait_for_kueue_available_code != 0:
-    xpk_exit(wait_for_kueue_available_code)
-
-  xpk_print('Install Kueue Custom Resources')
-  enable_kueue_credentials_code = install_kueue_crs(
-      args, system, autoprovisioning_config
+  autoprovisioning_enabled = False
+  if autoprovisioning_config:
+    # Determine total resources available based on autoprovisioning max chips.
+    autoprovisioning_enabled = True
+    total_chips = autoprovisioning_config.maximum_chips
+  else:
+    # Determine total chips based on user specified topology.
+    total_chips = get_total_chips_requested_from_args(args, system)
+  kueue_manager = KueueManager()
+  kueue_manager.install_or_upgrade(
+      KueueConfig(
+          system,
+          total_chips=total_chips,
+          autoprovisioning_enabled=autoprovisioning_enabled,
+          num_slices=args.num_slices,
+          flex=args.flex,
+          memory_limit=args.memory_limit,
+          cpu_limit=args.cpu_limit,
+          is_pathways_cluster=args.enable_pathways,
+          configure_sub_slicing=(
+              FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
+          ),
+      ),
   )
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
-
-  xpk_print('Update Kueue Controller Manager resources')
-  update_kueue_resources_code = update_kueue_resources_if_necessary(args)
-  if update_kueue_resources_code != 0:
-    xpk_exit(update_kueue_resources_code)
 
 
-def prepare_gpus(args, system: SystemCharacteristics):
+def prepare_gpus(system: SystemCharacteristics):
   xpk_print('Installing NCCL Plugin for cluster')
-  install_nccl_code = install_nccl_on_cluster(args, system)
+  install_nccl_code = install_nccl_on_cluster(system)
   if install_nccl_code != 0:
     xpk_exit(install_nccl_code)
 
   if system.device_type == H100_DEVICE_TYPE:
     xpk_print('Installing NRI device injector for cluster')
-    install_nri_code = install_nri_on_cluster(args)
+    install_nri_code = install_nri_on_cluster()
     if install_nri_code != 0:
       xpk_exit(install_nri_code)
 
   if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
     xpk_print('Disabling MGLRU')
-    err_code = disable_mglru_on_cluster(args)
+    err_code = disable_mglru_on_cluster()
     if err_code > 0:
       xpk_exit(err_code)
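
Note: the Kueue install/wait/configure/update sequence that 0.13.0 spread across install_kueue_on_cluster, wait_for_kueue_available, install_kueue_crs, and update_kueue_resources_if_necessary is now a single call into the new KueueManager (xpk/core/kueue_manager.py, +423 lines). A caller sketch assembled from the hunk above; the wrapper function name is illustrative, everything else mirrors the diff:

    # Caller sketch mirroring the new install_kueue flow shown above.
    from xpk.core.kueue_manager import KueueConfig, KueueManager
    from xpk.core.scheduling import get_total_chips_requested_from_args
    from xpk.utils.feature_flags import FeatureFlags


    def install_kueue_sketch(args, system, autoprovisioning_config) -> None:
      if autoprovisioning_config:
        # Size Kueue quotas from the autoprovisioning chip ceiling.
        autoprovisioning_enabled = True
        total_chips = autoprovisioning_config.maximum_chips
      else:
        # Size quotas from the user-requested topology.
        autoprovisioning_enabled = False
        total_chips = get_total_chips_requested_from_args(args, system)
      KueueManager().install_or_upgrade(
          KueueConfig(
              system,
              total_chips=total_chips,
              autoprovisioning_enabled=autoprovisioning_enabled,
              num_slices=args.num_slices,
              flex=args.flex,
              memory_limit=args.memory_limit,
              cpu_limit=args.cpu_limit,
              is_pathways_cluster=args.enable_pathways,
              configure_sub_slicing=(
                  FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing
              ),
          )
      )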