xpk 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/cluster.py CHANGED
@@ -78,6 +78,8 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
+import shutil
+import os


 def cluster_adapt(args) -> None:
@@ -247,6 +249,10 @@ def cluster_create(args) -> None:

   get_cluster_credentials(args)

+  update_coredns_command_code = update_coredns_if_necessary(args)
+  if update_coredns_command_code != 0:
+    xpk_exit(update_coredns_command_code)
+
   k8s_client = setup_k8s_env(args)

   install_storage_crd(k8s_client)
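Note: the new CoreDNS guard follows the same return-code convention as the other cluster_create steps: the helper returns 0 on success, and any non-zero code is forwarded straight to xpk_exit. A minimal sketch of that pattern (only update_coredns_if_necessary and xpk_exit come from this diff; the surrounding function body is elided):

```python
# Sketch of the return-code convention used in cluster_create (illustrative).
return_code = update_coredns_if_necessary(args)
if return_code != 0:
  xpk_exit(return_code)  # abort cluster creation if CoreDNS setup failed
```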
@@ -702,6 +708,262 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)


+def install_jq(args):
+  """Installs 'jq' utility."""
+  if shutil.which('jq'):
+    xpk_print("Task: 'Install jq' skipped, jq already installed.")
+    return
+  command_jq_install = 'sudo apt install jq -y'
+  xpk_print("Task: 'Install jq' in progress.")
+  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  if return_code != 0:
+    xpk_print(f'Install jq error {return_code}')
+    xpk_exit(return_code)
+
+
+def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+  """Clones the CoreDNS deployment repository if it doesn't exist."""
+  if os.path.exists(coredns_repo_full_path):
+    xpk_print(
+        f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
+    )
+    return
+  command_git_clone = (
+      'git clone https://github.com/coredns/deployment.git'
+      f' {coredns_repo_full_path}'
+  )
+  xpk_print(
+      "Task: 'Clone deployment' in progress, Target"
+      f' directory:{coredns_repo_full_path}.'
+  )
+  return_code = run_command_with_updates(
+      command_git_clone, 'Clone deployment', args
+  )
+  if return_code != 0:
+    xpk_print(f'Clone deployment error {return_code}')
+    xpk_exit(return_code)
+
+
+def deploy_coredns_manifests(args, coredns_k8s_path: str):
+  """Deploys CoreDNS manifests to the cluster."""
+  if not os.path.isdir(coredns_k8s_path):
+    xpk_print(
+        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
+        ' Has git clone been successful?'
+    )
+    xpk_exit(1)
+  original_cwd = os.getcwd()
+  try:
+    os.chdir(coredns_k8s_path)
+    xpk_print(f'Current working directory changed to: {os.getcwd()}')
+
+    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
+    xpk_print(
+        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
+    )
+    return_code = run_command_with_updates(
+        command_deploy_coredns, 'Deploy CoreDNS', args
+    )
+    if return_code != 0:
+      xpk_print(f'Deploy CoreDNS error {return_code}')
+
+  finally:
+    xpk_print(f'Restoring working directory to: {original_cwd}')
+    os.chdir(original_cwd)
+  if return_code != 0:
+    xpk_exit(return_code)
+
+
+def scale_down_deployment(
+    args, deployment_name: str, namespace: str = 'kube-system'
+):
+  """Scales down a specified Kubernetes deployment to 0 replicas."""
+  command = (
+      f'kubectl scale deployment {deployment_name} --replicas=0'
+      f' --namespace={namespace}'
+  )
+  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
+  return_code = run_command_with_updates(
+      command, f'Scale down {deployment_name}', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale down {deployment_name} error {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'\n{deployment_name} has been scaled down.')
+
+
+def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+  """Scales up the CoreDNS deployment to a specified number of replicas."""
+  command_coredns_scale = (
+      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
+  )
+  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
+  return_code = run_command_with_updates(
+      command_coredns_scale, 'Scale CoreDNS', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale CoreDNS error {return_code}')
+    xpk_exit(return_code)
+
+
+def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+  """Check for the existence of a specific Deployment in a given namespace."""
+  command = (
+      f'kubectl get deployment {deployment_name} -n'
+      f' {namespace} --ignore-not-found'
+  )
+  result = run_command_with_updates(
+      command, 'Waiting for kubeDNS to be checked.', args
+  )
+  return result
+
+
+def verify_coredns_readiness(
+    args, timeout: int = 120, namespace: str = 'kube-system'
+):
+  """Verifies CoreDNS readiness using kubectl wait commands."""
+  xpk_print('Now verifying CoreDNS readiness...')
+  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  if kube_dns_exists:
+    # Wait for kube-dns to be fully scaled down
+    command_kube_dns_wait_scaled_down = (
+        'kubectl wait deployment/kube-dns'
+        " --for=jsonpath='{.status.replicas}'=0"
+        f' --namespace={namespace} --timeout={timeout}s'
+    )
+    xpk_print('Verifying if kube-dns has scaled down...')
+    return_code_kube_dns = run_command_with_updates(
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+    )
+    if return_code_kube_dns != 0:
+      xpk_print('kube-dns did not scale down successfully within the timeout.')
+      xpk_exit(1)  # Exit if kube-dns cannot scale down
+    else:
+      xpk_print('kube-dns has successfully scaled down.')
+  else:
+    xpk_print('kube-dns deployment not found.')
+  # Wait for CoreDNS to be fully scaled up and available
+  command_coredns_wait_available = (
+      'kubectl wait deployment/coredns --for=condition=Available=true'
+      f' --namespace={namespace} --timeout={timeout}s'
+  )
+  xpk_print('Verifying if CoreDNS is available...')
+  return_code_coredns = run_command_with_updates(
+      command_coredns_wait_available, 'Wait for coredns available', args
+  )
+  if return_code_coredns != 0:
+    xpk_print(
+        'CoreDNS verification failed, it might not have fully started within'
+        ' the timeout.'
+    )
+    xpk_exit(1)  # Exit if coredns cannot become available

+  xpk_print('CoreDNS has successfully started and passed verification.')
+
+
+def cleanup_coredns_repo(coredns_repo_full_path: str):
+  """Deletes the cloned CoreDNS deployment directory."""
+  xpk_print(
+      "Task: 'Deleting CoreDNS deployment directory' in progress:"
+      f' {coredns_repo_full_path}'
+  )
+  try:
+    shutil.rmtree(coredns_repo_full_path)
+    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
+  except OSError as e:
+    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
+
+
+def update_coredns(args):
+  """Updates and deploys CoreDNS within a cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  coredns_repo_dir = os.path.expanduser('/tmp/')
+  coredns_repo_dir_name = 'deployment'
+  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
+  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  # 1. Install jq
+  install_jq(args)
+
+  # 2. Clone CoreDNS deployment repository
+  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+
+  # 3. Deploy CoreDNS to the cluster
+  deploy_coredns_manifests(args, coredns_k8s_path)
+
+  # 4. Scale down kube-dns-autoscaler
+  scale_down_deployment(args, 'kube-dns-autoscaler')
+
+  # 5. Scale down kube-dns
+  scale_down_deployment(args, 'kube-dns')
+
+  # 6. Scale up coredns and verify readiness
+  scale_up_coredns(args, replicas=15)
+  verify_coredns_readiness(args, timeout=120)
+
+  xpk_print('The CoreDNS setup process has been completed.')
+
+  # 7. Cleanup
+  cleanup_coredns_repo(coredns_repo_full_path)
+
+  return 0
+
+
+def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+  """Checks if the CoreDNS deployment exists in the given namespace.
+
+  Args:
+    namespace: The Kubernetes namespace to check for the CoreDNS deployment.
+
+  Returns:
+    True if the 'coredns' deployment exists, False otherwise.
+  """
+  command = f'kubectl get deployment coredns -n {namespace}'
+  xpk_print(
+      "Task: 'Checking CoreDNS deployment existence' in progress for"
+      f' namespace: {namespace}'
+  )
+  return_code = run_command_with_updates(
+      command, f'Check CoreDNS deployment in {namespace}', args
+  )
+  if return_code == 0:
+    verify_coredns_readiness(args)
+    xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
+    return True
+  else:
+    xpk_print(
+        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
+        ' an error occurred.'
+    )
+    return False
+
+
+def update_coredns_if_necessary(args) -> int:
+  """Updates and deploys CoreDNS within the cluster if it's not already present.
+
+  This function checks for the existence of the CoreDNS deployment.
+  If it's not found, it proceeds to deploy and configure CoreDNS.
+
+  Args:
+    args: User-provided arguments for running the command.
+
+  Returns:
+    0 if successful (CoreDNS was already present or successfully deployed),
+    and 1 otherwise.
+  """
+  if coredns_deployment_exists(args, namespace='kube-system'):
+    xpk_print('Skipping CoreDNS deployment since it already exists.')
+    return 0
+  else:
+    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
+    return update_coredns(args)
+
+
 def create_cluster_if_necessary(
     args, gke_control_plane_version: str, system: SystemCharacteristics
 ) -> int:
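Note on the CoreDNS helpers added above: taken together, they migrate a cluster from kube-dns to CoreDNS using the upstream coredns/deployment scripts. The shell steps that update_coredns drives, in order, are roughly the following (collected from the command strings in this hunk; the path, replica count, and timeout are the defaults hard-coded above):

```python
# Rough sequence of shell commands driven by update_coredns (a sketch, not
# part of the package; xpk runs each through run_command_with_updates).
COREDNS_MIGRATION_STEPS = [
    'sudo apt install jq -y',  # deploy.sh depends on jq
    'git clone https://github.com/coredns/deployment.git /tmp/deployment',
    './deploy.sh | kubectl apply -f -',  # run from /tmp/deployment/kubernetes
    'kubectl scale deployment kube-dns-autoscaler --replicas=0'
    ' --namespace=kube-system',
    'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system',
    'kubectl scale deployment coredns --replicas=15 -n kube-system',
    'kubectl wait deployment/coredns --for=condition=Available=true'
    ' --namespace=kube-system --timeout=120s',
]
```

update_coredns_if_necessary short-circuits this whole sequence when a coredns deployment is already present in kube-system.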
@@ -842,6 +1104,7 @@ def run_gke_cluster_create_command(
       f' {args.custom_cluster_arguments}'
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
+      ' --autoscaling-profile=optimize-utilization'
   )

   enable_ip_alias = False
xpk/core/capacity.py CHANGED
@@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type(
     case CapacityType.ON_DEMAND.name:
       node_selector = ''
     case CapacityType.FLEX_START.name:
-      node_selector = 'cloud.google.com/gke-queued="true"'
+      node_selector = 'cloud.google.com/gke-queued: "true"'
     case CapacityType.SPOT.name:
-      node_selector = 'cloud.google.com/gke-spot="true"'
+      node_selector = 'cloud.google.com/gke-spot: "true"'
     case CapacityType.RESERVATION.name:
       node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
     case _:
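Note: the FLEX_START and SPOT selectors change from key="true" strings to YAML-style key: "true" mappings, matching the format already used for the reservation case. A sketch of how such a string lands once substituted into a manifest template (the surrounding template is illustrative, not from the package):

```python
# Illustrative only: the selector string is a YAML fragment that is
# substituted under a nodeSelector key in a manifest template.
node_selector = 'cloud.google.com/gke-queued: "true"'
manifest_fragment = f"""
  nodeSelector:
    {node_selector}
"""
# Renders as:
#   nodeSelector:
#     cloud.google.com/gke-queued: "true"
```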
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.10.1'
+__version__ = 'v0.11.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

xpk/core/jobset.py CHANGED
@@ -81,7 +81,7 @@ spec:
             limits:
               memory: {memory_limit_size}
             requests:
-              cpu: 500m
+              cpu: 1000m
               memory: 128Mi
           securityContext:
             allowPrivilegeEscalation: false
xpk/core/kueue.py CHANGED
@@ -244,14 +244,16 @@ spec:
           periodSeconds: 10
           resources:
             limits:
-              cpu: 500m
+              cpu: 1000m
               memory: {memory_limit_size}
             requests:
-              cpu: 500m
+              cpu: 1000m
               memory: 512Mi
           securityContext:
             allowPrivilegeEscalation: false
           volumeMounts:
+          - mountPath: /visibility
+            name: visibility
           - mountPath: /tmp/k8s-webhook-server/serving-certs
             name: cert
             readOnly: true
@@ -263,6 +265,8 @@ spec:
         serviceAccountName: kueue-controller-manager
         terminationGracePeriodSeconds: 10
         volumes:
+        - name: visibility
+          emptyDir: {{}}
         - name: cert
           secret:
             defaultMode: 420
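Note: besides doubling the controller's CPU request and limit, the kueue-controller-manager pod gains an emptyDir-backed volume mounted at /visibility. For readers more used to the Kubernetes Python client than raw manifests, the same mount could be expressed as below (an illustration of the objects involved, not code from xpk, which patches the manifest as YAML text):

```python
# Illustrative equivalent of the added volume/volumeMount using the official
# kubernetes Python client (pip install kubernetes); xpk itself edits YAML.
from kubernetes import client

visibility_volume = client.V1Volume(
    name='visibility',
    empty_dir=client.V1EmptyDirVolumeSource(),  # ephemeral, pod-lifetime storage
)
visibility_mount = client.V1VolumeMount(
    name='visibility',
    mount_path='/visibility',
)
```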
xpk/core/nap.py CHANGED
@@ -42,6 +42,8 @@ AUTOPROVISIONING_CONFIG_FILE = """
 management:
   autoRepair: true
   autoUpgrade: true
+scopes:
+  - "https://www.googleapis.com/auth/devstorage.read_write"
 autoprovisioningLocations:
 {zones}
 {resource_limits}
@@ -106,6 +108,18 @@ def enable_autoprovisioning_on_cluster(
     xpk_print(f'{task} request returned ERROR {return_code}')
     return autoprovisioning_config, return_code

+  command = (
+      'gcloud container clusters update'
+      f' {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)}'
+      ' --autoscaling-profile=optimize-utilization'
+  )
+  task = 'Update cluster with autoscaling-profile'
+  return_code = run_command_with_updates(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} request returned ERROR {return_code}')
+    return autoprovisioning_config, return_code
+
   # Update created accelerator node pools to support autoprovisioning.
   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
   if return_code != 0:
@@ -171,11 +185,11 @@ def create_autoprovisioning_config(
   # is not controlled by NAP.
   cpu_limits = """
     minimum: 1
-    maximum: 10000
+    maximum: 1000000
   """
   memory_limits = """
     minimum: 1
-    maximum: 10000
+    maximum: 10000000
   """

   # By default, the maximum chips is set to be the current number of resources used
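Note: enable_autoprovisioning_on_cluster now also switches the cluster's autoscaling profile to optimize-utilization (the same flag run_gke_cluster_create_command passes at creation time), the NAP scopes gain devstorage.read_write, and the CPU/memory ceilings are raised. A hedged way to confirm the profile took effect, written in the same command-string style as the module (the describe command is standard gcloud, but this check is not part of the package):

```python
# Sketch: verify the autoscaling profile after the update (not in xpk).
verify_cmd = (
    'gcloud container clusters describe'
    f' {args.cluster} --project={args.project}'
    f' --region={zone_to_region(args.zone)}'
    " --format='value(autoscaling.autoscalingProfile)'"
)
# Expected output once the update has applied: OPTIMIZE_UTILIZATION
```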
xpk/core/nodepool.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul

 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code

-  if system.accelerator_type == AcceleratorType['GPU']:
-    xpk_print(
-        f'Creating 1 node pool with {args.num_nodes} nodes of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [f'{args.cluster}-np-0']
-  else:
-    xpk_print(
-        f'Creating {args.num_slices} node pool or pools of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [
-        f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
-    ]
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )

   node_pools_to_remain = []
   delete_commands = []
@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
       command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      else:
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-        command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
-      command += f' --tpu-topology={system.topology}'
-      command += f' {args.custom_tpu_nodepool_arguments}'
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+        command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:
@@ -319,6 +332,8 @@ def run_gke_node_pool_create_command(
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'

+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)
@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None

   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)
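Note: two behavioural points in this file are easy to miss. First, get_desired_node_pool_names reuses existing <cluster>-np-* pool names up to the requested count and fills any remaining slots with the lowest unused indices; because it goes through a set, the returned order is unspecified. Second, the product of the TPU topology dimensions now gates --placement-type=COMPACT, --max-pods-per-node, --tpu-topology, the fixed --num-nodes, and the custom TPU node-pool arguments: they are added only when that product is greater than 1 (i.e. skipped for single-chip topologies such as 1x1). A small illustration (example names and topologies invented):

```python
from functools import reduce
from operator import mul

# get_desired_node_pool_names keeps matching existing pools and tops up with
# the lowest free indices; sorted() here only makes the output deterministic.
existing = ['demo-np-0', 'demo-np-3', 'unrelated-pool']
print(sorted(get_desired_node_pool_names(existing, 'demo', 3)))
# -> ['demo-np-0', 'demo-np-1', 'demo-np-3']

# The topology product decides whether the TPU-specific placement flags apply.
assert reduce(mul, (int(x) for x in '1x1'.split('x')), 1) == 1    # flags skipped
assert reduce(mul, (int(x) for x in '2x2x1'.split('x')), 1) == 4  # flags added
```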
xpk/core/scheduling.py CHANGED
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'Gke Accelerator Type Check: {args.workload} is requesting'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
         f' {system.gke_accelerator} but cluster only contains'
         f' {cluster_config_map.keys()}. '
     )