xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. xpk/commands/cluster.py +270 -8
  2. xpk/commands/cluster_gcluster.py +2 -1
  3. xpk/commands/common.py +3 -3
  4. xpk/commands/info.py +12 -12
  5. xpk/commands/job.py +12 -10
  6. xpk/commands/kjob_common.py +2 -1
  7. xpk/commands/storage.py +1 -1
  8. xpk/commands/workload.py +12 -6
  9. xpk/core/blueprint/blueprint_generator.py +7 -7
  10. xpk/core/blueprint/blueprint_test.py +218 -0
  11. xpk/core/capacity.py +5 -3
  12. xpk/core/cluster.py +9 -7
  13. xpk/core/cluster_private.py +5 -1
  14. xpk/core/commands.py +3 -3
  15. xpk/core/config.py +3 -4
  16. xpk/core/config_test.py +71 -0
  17. xpk/core/docker_manager.py +1 -1
  18. xpk/core/docker_resources.py +1 -1
  19. xpk/core/filestore.py +7 -2
  20. xpk/core/gcloud_context.py +2 -2
  21. xpk/core/jobset.py +1 -1
  22. xpk/core/kjob.py +2 -1
  23. xpk/core/kueue.py +12 -4
  24. xpk/core/nap.py +20 -6
  25. xpk/core/nodepool.py +52 -19
  26. xpk/core/nodepool_test.py +82 -0
  27. xpk/core/resources.py +1 -7
  28. xpk/core/scheduling.py +1 -1
  29. xpk/core/storage.py +14 -14
  30. xpk/core/system_characteristics.py +267 -1081
  31. xpk/core/workload.py +11 -0
  32. xpk/core/workload_decorators/rdma_decorator.py +3 -2
  33. xpk/core/workload_decorators/storage_decorator.py +2 -1
  34. xpk/core/workload_decorators/tcpx_decorator.py +4 -2
  35. xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
  36. xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
  37. xpk/core/workload_test.py +28 -0
  38. xpk/main.py +9 -10
  39. xpk/parser/cluster.py +67 -49
  40. xpk/parser/common.py +45 -36
  41. xpk/parser/storage.py +12 -13
  42. xpk/parser/workload.py +57 -39
  43. xpk/utils/console.py +2 -1
  44. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
  45. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
  46. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
  47. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
  48. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
  49. {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/commands/cluster.py CHANGED
@@ -78,6 +78,8 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
+import shutil
+import os
 
 
 def cluster_adapt(args) -> None:
@@ -90,7 +92,7 @@ def cluster_adapt(args) -> None:
 
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -139,8 +141,6 @@ def cluster_adapt(args) -> None:
     if not tensorboard_config:
       xpk_exit(1)
 
-  # Provision node pools dynamically based on incoming workloads:
-  # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
   if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
@@ -199,7 +199,7 @@ def cluster_create(args) -> None:
   """
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -215,13 +215,13 @@ def cluster_create(args) -> None:
     xpk_exit(0)
 
   return_code, gke_server_config = get_gke_server_config(args)
-  if return_code != 0:
+  if return_code != 0 or gke_server_config is None:
     xpk_exit(return_code)
 
   return_code, gke_control_plane_version = get_gke_control_plane_version(
       args, gke_server_config
   )
-  if return_code != 0:
+  if return_code != 0 or gke_control_plane_version is None:
     xpk_exit(return_code)
 
   create_cluster_command_code = create_cluster_if_necessary(
@@ -247,6 +247,10 @@ def cluster_create(args) -> None:
 
   get_cluster_credentials(args)
 
+  update_coredns_command_code = update_coredns_if_necessary(args)
+  if update_coredns_command_code != 0:
+    xpk_exit(update_coredns_command_code)
+
   k8s_client = setup_k8s_env(args)
 
   install_storage_crd(k8s_client)
@@ -288,7 +292,7 @@ def cluster_create(args) -> None:
   # Provision node pools dynamically based on incoming workloads:
   # Currently autoprovisioning is not supported with Pathways.
   autoprovisioning_config = None
-  if not args.enable_pathways and args.enable_autoprovisioning:
+  if args.enable_autoprovisioning:
     xpk_print('Enabling Autoprovisioning')
     autoprovisioning_config, return_code = enable_autoprovisioning_on_cluster(
         args, system
@@ -392,7 +396,7 @@ def cluster_cacheimage(args) -> None:
   get_cluster_credentials(args)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -702,6 +706,263 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)
 
 
+def install_jq(args):
+  """Installs 'jq' utility."""
+  if shutil.which('jq'):
+    xpk_print("Task: 'Install jq' skipped, jq already installed.")
+    return
+  command_jq_install = 'sudo apt install jq -y'
+  xpk_print("Task: 'Install jq' in progress.")
+  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  if return_code != 0:
+    xpk_print(f'Install jq error {return_code}')
+    xpk_exit(return_code)
+
+
+def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+  """Clones the CoreDNS deployment repository if it doesn't exist."""
+  if os.path.exists(coredns_repo_full_path):
+    xpk_print(
+        f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
+    )
+    return
+  command_git_clone = (
+      'git clone https://github.com/coredns/deployment.git'
+      f' {coredns_repo_full_path}'
+  )
+  xpk_print(
+      "Task: 'Clone deployment' in progress, Target"
+      f' directory:{coredns_repo_full_path}.'
+  )
+  return_code = run_command_with_updates(
+      command_git_clone, 'Clone deployment', args
+  )
+  if return_code != 0:
+    xpk_print(f'Clone deployment error {return_code}')
+    xpk_exit(return_code)
+
+
+def deploy_coredns_manifests(args, coredns_k8s_path: str):
+  """Deploys CoreDNS manifests to the cluster."""
+  if not os.path.isdir(coredns_k8s_path):
+    xpk_print(
+        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
+        ' Has git clone been successful?'
+    )
+    xpk_exit(1)
+  original_cwd = os.getcwd()
+  try:
+    os.chdir(coredns_k8s_path)
+    xpk_print(f'Current working directory changed to: {os.getcwd()}')
+
+    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
+    xpk_print(
+        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
+    )
+    return_code = run_command_with_updates(
+        command_deploy_coredns, 'Deploy CoreDNS', args
+    )
+    if return_code != 0:
+      xpk_print(f'Deploy CoreDNS error {return_code}')
+
+  finally:
+    xpk_print(f'Restoring working directory to: {original_cwd}')
+    os.chdir(original_cwd)
+    if return_code != 0:
+      xpk_exit(return_code)
+
+
+def scale_down_deployment(
+    args, deployment_name: str, namespace: str = 'kube-system'
+):
+  """Scales down a specified Kubernetes deployment to 0 replicas."""
+  command = (
+      f'kubectl scale deployment {deployment_name} --replicas=0'
+      f' --namespace={namespace}'
+  )
+  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
+  return_code = run_command_with_updates(
+      command, f'Scale down {deployment_name}', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale down {deployment_name} error {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'\n{deployment_name} has been scaled down.')
+
+
+def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+  """Scales up the CoreDNS deployment to a specified number of replicas."""
+  command_coredns_scale = (
+      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
+  )
+  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
+  return_code = run_command_with_updates(
+      command_coredns_scale, 'Scale CoreDNS', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale CoreDNS error {return_code}')
+    xpk_exit(return_code)
+
+
+def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+  """Check for the existence of a specific Deployment in a given namespace."""
+  # TODO: rewrite this to be more obvious, check if it is correct
+  command = (
+      f'kubectl get deployment {deployment_name} -n'
+      f' {namespace} --ignore-not-found'
+  )
+  result = run_command_with_updates(
+      command, 'Waiting for kubeDNS to be checked.', args
+  )
+  return result != 0
+
+
+def verify_coredns_readiness(
+    args, timeout: int = 240, namespace: str = 'kube-system'
+):
+  """Verifies CoreDNS readiness using kubectl wait commands."""
+  xpk_print('Now verifying CoreDNS readiness...')
+  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  if kube_dns_exists:
+    # Wait for kube-dns to be fully scaled down
+    command_kube_dns_wait_scaled_down = (
+        'kubectl wait deployment/kube-dns'
+        " --for=jsonpath='{.status.replicas}'=0"
+        f' --namespace={namespace} --timeout={timeout}s'
+    )
+    xpk_print('Verifying if kube-dns has scaled down...')
+    return_code_kube_dns = run_command_with_updates(
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+    )
+    if return_code_kube_dns != 0:
+      xpk_print('kube-dns did not scale down successfully within the timeout.')
+      xpk_exit(1)  # Exit if kube-dns cannot scale down
+    else:
+      xpk_print('kube-dns has successfully scaled down.')
+  else:
+    xpk_print('kube-dns deployment not found.')
+  # Wait for CoreDNS to be fully scaled up and available
+  command_coredns_wait_available = (
+      'kubectl wait deployment/coredns --for=condition=Available=true'
+      f' --namespace={namespace} --timeout={timeout}s'
+  )
+  xpk_print('Verifying if CoreDNS is available...')
+  return_code_coredns = run_command_with_updates(
+      command_coredns_wait_available, 'Wait for coredns available', args
+  )
+  if return_code_coredns != 0:
+    xpk_print(
+        'CoreDNS verification failed, it might not have fully started within'
+        ' the timeout.'
+    )
+    xpk_exit(1)  # Exit if coredns cannot become available
+
+  xpk_print('CoreDNS has successfully started and passed verification.')
+
+
+def cleanup_coredns_repo(coredns_repo_full_path: str):
+  """Deletes the cloned CoreDNS deployment directory."""
+  xpk_print(
+      "Task: 'Deleting CoreDNS deployment directory' in progress:"
+      f' {coredns_repo_full_path}'
+  )
+  try:
+    shutil.rmtree(coredns_repo_full_path)
+    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
+  except OSError as e:
+    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
+
+
+def update_coredns(args) -> int:
+  """Updates and deploys CoreDNS within a cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  coredns_repo_dir = os.path.expanduser('/tmp/')
+  coredns_repo_dir_name = 'deployment'
+  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
+  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  # 1. Install jq
+  install_jq(args)
+
+  # 2. Clone CoreDNS deployment repository
+  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+
+  # 3. Deploy CoreDNS to the cluster
+  deploy_coredns_manifests(args, coredns_k8s_path)
+
+  # 4. Scale down kube-dns-autoscaler
+  scale_down_deployment(args, 'kube-dns-autoscaler')
+
+  # 5. Scale down kube-dns
+  scale_down_deployment(args, 'kube-dns')
+
+  # 6. Scale up coredns and verify readiness
+  scale_up_coredns(args, replicas=15)
+  verify_coredns_readiness(args, timeout=120)
+
+  xpk_print('The CoreDNS setup process has been completed.')
+
+  # 7. Cleanup
+  cleanup_coredns_repo(coredns_repo_full_path)
+
+  return 0
+
+
+def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+  """Checks if the CoreDNS deployment exists in the given namespace.
+
+  Args:
+    namespace: The Kubernetes namespace to check for the CoreDNS deployment.
+
+  Returns:
+    True if the 'coredns' deployment exists, False otherwise.
+  """
+  command = f'kubectl get deployment coredns -n {namespace}'
+  xpk_print(
+      "Task: 'Checking CoreDNS deployment existence' in progress for"
+      f' namespace: {namespace}'
+  )
+  return_code = run_command_with_updates(
+      command, f'Check CoreDNS deployment in {namespace}', args
+  )
+  if return_code == 0:
+    verify_coredns_readiness(args)
+    xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
+    return True
+  else:
+    xpk_print(
+        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
+        ' an error occurred.'
+    )
+    return False
+
+
+def update_coredns_if_necessary(args) -> int:
+  """Updates and deploys CoreDNS within the cluster if it's not already present.
+
+  This function checks for the existence of the CoreDNS deployment.
+  If it's not found, it proceeds to deploy and configure CoreDNS.
+
+  Args:
+    args: User-provided arguments for running the command.
+
+  Returns:
+    0 if successful (CoreDNS was already present or successfully deployed),
+    and 1 otherwise.
+  """
+  if coredns_deployment_exists(args, namespace='kube-system'):
+    xpk_print('Skipping CoreDNS deployment since it already exists.')
+    return 0
+  else:
+    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
+    return update_coredns(args)
+
+
 def create_cluster_if_necessary(
     args, gke_control_plane_version: str, system: SystemCharacteristics
 ) -> int:
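The check_deployment_exists helper above carries a TODO questioning its own logic: with --ignore-not-found, kubectl exits 0 whether or not the Deployment exists, so an exit-code test alone cannot distinguish the two cases. A sketch of an output-based alternative, assuming xpk's run_command_for_value helper (imported elsewhere in this release from ..core.commands), which returns the command's exit code together with its captured stdout:

# Sketch only, not the shipped implementation. Assumes
# run_command_for_value(command, task_name, args) -> (return_code, stdout).
def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
  command = (
      f'kubectl get deployment {deployment_name} -n {namespace}'
      ' --ignore-not-found -o name'
  )
  return_code, output = run_command_for_value(
      command, f'Check for {deployment_name}', args
  )
  if return_code != 0:
    xpk_exit(return_code)
  # Non-empty output (e.g. 'deployment.apps/kube-dns') means it exists;
  # with --ignore-not-found a missing object yields an empty string.
  return bool(output.strip())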
@@ -842,6 +1103,7 @@ def run_gke_cluster_create_command(
       f' {args.custom_cluster_arguments}'
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
+      ' --autoscaling-profile=optimize-utilization'
   )
 
   enable_ip_alias = False
xpk/commands/cluster_gcluster.py CHANGED
@@ -310,4 +310,5 @@ def generate_blueprint(
         system_node_pool_machine_type=args.default_pool_cpu_machine_type,
         system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
     )
-  return None
+  xpk_print('Device type is not supported.')
+  xpk_exit(1)
xpk/commands/common.py CHANGED
@@ -50,8 +50,8 @@ def set_cluster_command(args) -> int:
 
 
 def is_TAS_possible(
-    system_characteristics: SystemCharacteristics,
-    capacity_type: CapacityType,
+    system_characteristics: SystemCharacteristics | None,
+    capacity_type: CapacityType | None,
     flex: bool,
 ) -> bool:
   """Check cluster's machine_type and capacity type to determine if Kueue TAS is possible
@@ -71,7 +71,7 @@ def is_TAS_possible(
     xpk_print('capacity_type data was not found in configmaps.')
     xpk_exit(1)
 
-  if flex:
+  if not flex:
     return False
 
   if (
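The inverted guard above is one of the behavioural fixes in this release: 0.10.1 returned False precisely when flex capacity was requested, while 0.12.0 returns False for everything except flex capacity. A minimal sketch of the corrected control flow (not the full xpk implementation; the machine-type and capacity checks are collapsed into a hypothetical boolean):

# Sketch only: illustrates the guard ordering with hypothetical inputs.
def is_tas_possible(flex: bool, machine_type_supported: bool) -> bool:
  if not flex:
    return False  # TAS is only attempted for flex (DWS Flex) capacity
  return machine_type_supported


assert is_tas_possible(flex=False, machine_type_supported=True) is False
assert is_tas_possible(flex=True, machine_type_supported=True) is True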
xpk/commands/info.py CHANGED
@@ -51,19 +51,19 @@ def info(args: Namespace) -> None:
   cqs = run_kueuectl_list_clusterqueue(args)
   quotas = get_nominal_quotas(cqs)
 
-  if lq:
+  if lq and lqs is not None:
     print_formatted_lqs(lqs, quotas)
 
   if cq:
     print_formatted_cqs(cqs, quotas)
 
 
-def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
+def get_nominal_quotas(cqs: str) -> dict[str, dict[str, str]]:
   """Get quotas from clusterqueues.
   This function retrieves how much of resource in each flavor is assigned to cluster queue.
   It parses flavors of passed cluster queues.
   Args:
-  - cqs - list of cluster queues.
+  - cqs - string containing a list of cluster queues in JSON format.
   Returns:
   - dictionary of cluster queues resources quotas in format:
     {cq_name:{"flavorName:resourceName":quota}}
@@ -75,7 +75,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
     xpk_print(cqs)
     xpk_exit(1)
 
-  quotas = {}
+  quotas: dict[str, dict] = {}
   for cq in cq_list:
     spec = cq['spec']
     cq_name = cq['metadata']['name']
@@ -89,7 +89,7 @@ def get_nominal_quotas(cqs: list[dict]) -> dict[str, dict[str, str]]:
   return quotas
 
 
-def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
+def print_formatted_cqs(cqs: str, nominalQuotas) -> None:
   try:
     cq_list = json.loads(cqs)['items']
   except ValueError:
@@ -105,7 +105,7 @@ def print_formatted_cqs(cqs: list[dict], nominalQuotas) -> None:
   )
 
 
-def print_formatted_lqs(lqs: list[dict], nominalQuotas) -> None:
+def print_formatted_lqs(lqs: str, nominalQuotas) -> None:
   try:
     lq_list = json.loads(lqs)['items']
   except ValueError:
@@ -143,18 +143,18 @@ def parse_queue_lists(
 
 
 def get_flavors_resources_reservations(
-    cq_name: str, flavors_res: list[dict]
+    cq_name: str, flavors_res: dict
 ) -> dict[str, dict[str, str]]:
   """Get usage of flavors resources.
   This function parser flavorsReservation section of clusterQueue of LocalQueue.
   Args:
   - cq_name - name of ClusterQueue to which flavors belong.
-  - flavors_res - list of reservations made by flavors
+  - flavors_res - dict of reservations made by flavors
   Returns:
     Dict containing usage of each resource in flavor for each flavor in cluster or local queue.
     Dict format: {cq_name: {{flavor:resource}:reservation}}
   """
-  reservations = {}
+  reservations: dict[str, dict] = {}
   reservations[cq_name] = {}
   for flavor_name, flavor_resources_reservation_list in flavors_res.items():
     for resource in flavor_resources_reservation_list:
@@ -167,15 +167,15 @@ def get_flavors_resources_reservations(
 
 def get_flavors_usage(
     q_entry: dict, res_field: str, flavor_resource_quotas: dict
-) -> list[dict]:
+) -> dict[str, str]:
   """Parse q_entry to retrieve list of each resource usage in flavour.
   Args:
   q_entry - single entry into either LocalQueue or ClusterQueue structured as json
   flavor_resource_quotas - nominalQuota of flavors resource usage for each clusterqueue
   Returns:
-  list of dicts where each list entry is in format (key, entry) where:
+  Dict where for each (key, value):
   - key is flavorName:resourceName
-  - entry is flavorResourceReservation/flavorResourceQuota
+  - value is string formatted as 'flavorResourceReservation/flavorResourceQuota'
   """
   status = q_entry['status']
   flavors_res = status[res_field]
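The quota dictionaries these functions build follow the format documented in the docstrings above: {cq_name: {"flavorName:resourceName": quota}}. A toy walk-through with hypothetical data (the nested structure mirrors Kueue's ClusterQueue spec.resourceGroups[].flavors[].resources[]; this is not xpk's implementation):

import json

# Hypothetical kueuectl-style JSON output for a single ClusterQueue.
cqs = json.dumps({'items': [{
    'metadata': {'name': 'cluster-queue'},
    'spec': {'resourceGroups': [{'flavors': [{
        'name': 'tpu-flavor',
        'resources': [{'name': 'google.com/tpu', 'nominalQuota': '8'}],
    }]}]},
}]})

quotas: dict[str, dict[str, str]] = {}
for cq in json.loads(cqs)['items']:
  cq_name = cq['metadata']['name']
  quotas[cq_name] = {}
  for group in cq['spec']['resourceGroups']:
    for flavor in group['flavors']:
      for res in flavor['resources']:
        quotas[cq_name][f"{flavor['name']}:{res['name']}"] = res['nominalQuota']

print(quotas)  # {'cluster-queue': {'tpu-flavor:google.com/tpu': '8'}}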
xpk/commands/job.py CHANGED
@@ -18,6 +18,7 @@ import re
 import sys
 
 from ruamel.yaml import YAML
+from typing import cast
 
 from ..core.commands import run_command_for_value, run_command_with_updates
 from ..core.cluster import get_cluster_credentials
@@ -84,7 +85,7 @@ def job_info(args):
 
 
 def get_profile(job_yaml: dict) -> str:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
@@ -96,13 +97,13 @@ def get_profile(job_yaml: dict) -> str:
 
 
 def get_mounts(job_yaml: dict) -> list[dict]:
-  containers = (
+  containers: list[dict] = (
       job_yaml.get('spec', {})
       .get('template', {})
       .get('spec', {})
       .get('containers', [])
   )
-  mounts = next(iter(containers), {}).get('volumeMounts', [])
+  mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
   return mounts
 
 
@@ -112,23 +113,24 @@ def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
   return search_res
 
 
-def get_pods(pods_text: str) -> list[str]:
+def get_pods(pods_text: str) -> list[dict[str, str]]:
   pods_lines = pods_text.strip().split('\n')
-  pods_lines = [line.split() for line in pods_lines]
+  pods_lines_tokenized = [line.split() for line in pods_lines]
   return [
       {
-          'Name': line[0],
-          'Status': line[2],
+          'Name': tokens[0],
+          'Status': tokens[2],
       }
-      for line in pods_lines
+      for tokens in pods_lines_tokenized
   ]
 
 
 def get_script_name(job_yaml: dict) -> str | None:
-  return (
+  return cast(
+      str | None,
       job_yaml.get('metadata', {})
       .get('annotations', {})
-      .get('kjobctl.x-k8s.io/script', '')
+      .get('kjobctl.x-k8s.io/script', ''),
   )
 
 
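The typing.cast added to get_script_name is purely a static-typing aid: chained dict.get calls on untyped nested dicts produce Any, so the result is cast to the declared str | None; cast is a no-op at runtime. A minimal self-contained illustration (hypothetical data, not xpk code):

from typing import cast

job_yaml: dict = {
    'metadata': {'annotations': {'kjobctl.x-k8s.io/script': 'run.sh'}}
}

script = cast(
    str | None,
    job_yaml.get('metadata', {})
    .get('annotations', {})
    .get('kjobctl.x-k8s.io/script', ''),
)
print(script)  # run.sh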
xpk/commands/kjob_common.py CHANGED
@@ -33,6 +33,7 @@ from ..core.resources import get_cluster_capacity_type, get_cluster_system_chara
 def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   gpu_type = get_gpu_type_from_cluster(args)
 
+  annotations: tuple
   if gpu_type == H100_MEGA_DEVICE_TYPE:
     annotations = get_a3mega_pod_template_annotations(args)
   elif gpu_type == H200_DEVICE_TYPE:
@@ -40,7 +41,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
   elif gpu_type == B200_DEVICE_TYPE:
     annotations = get_a4_pod_template_annotations(args)
   else:
-    annotations = []
+    annotations = tuple()
 
   flags = [
       f" --pod-template-annotation {annotation} " for annotation in annotations
xpk/commands/storage.py CHANGED
@@ -141,7 +141,7 @@ def storage_delete(args: Namespace) -> None:
 
 def storage_attach(args: Namespace) -> None:
   add_zone_and_project(args)
-  manifest = [{}]
+  manifest: list[dict] = [{}]
   if args.type == GCP_FILESTORE_TYPE:
     if args.instance is None:
       args.instance = args.name
xpk/commands/workload.py CHANGED
@@ -84,6 +84,7 @@ from ..core.system_characteristics import (
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
     check_if_workload_exists,
+    get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
     zone_to_region,
@@ -226,7 +227,8 @@ spec:
       metadata:
         labels:
           xpk.google.com/workload: {args.workload}
-        annotations: {annotations}
+        annotations:
+          {annotations}
       spec:
         priorityClassName: {args.priority}
         restartPolicy: Never
@@ -319,7 +321,7 @@ def workload_create(args) -> None:
   xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
 
-  if return_code > 0:
+  if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
 
@@ -345,7 +347,7 @@ def workload_create(args) -> None:
   ):
     xpk_print(
         'Warning: Cluster has been created using XPK version:'
-        f' {cluster_config_map["xpk_version"]} but the XPK version you are'
+        f' {cluster_xpk_version} but the XPK version you are'
         f' using to schedule workload is: {XPK_CURRENT_VERSION}. Some features'
         ' might not be available for this cluster. We recommend to'
         ' upgrade/downgrade your XPK version or cluster by running `xpk'
@@ -354,7 +356,7 @@ def workload_create(args) -> None:
 
   debugging_dashboard_id = None
 
-  tensorboard_config = {}
+  tensorboard_config: dict | None = {}
   if VERTEX_TENSORBOARD_FEATURE_FLAG and args.use_vertex_tensorboard:
     tensorboard_config = create_vertex_experiment(args)
     # exit if failed to create Experiment in Vertex AI
@@ -450,8 +452,8 @@ def workload_create(args) -> None:
         - action: FailJobSet
           onJobFailureReasons:
           - PodFailurePolicy"""
-    restart_on_exit_codes = get_restart_exit_codes(args)
-    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes))
+    restart_on_exit_codes_list = get_restart_exit_codes(args)
+    restart_on_exit_codes = ','.join(map(str, restart_on_exit_codes_list))
     pod_failure_policy = f"""
       podFailurePolicy:
         rules:
@@ -760,4 +762,8 @@ def workload_list(args) -> None:
     xpk_print(f'List Job request returned ERROR {return_code}')
     xpk_exit(return_code)
   xpk_print(f'Workload List Output:\n{return_value}')
+
+  workload_list_gcp_link = get_jobsets_list_gcp_link(project=args.project)
+  xpk_print(f'See your workloads in Cloud Console: {workload_list_gcp_link}')
+
   xpk_exit(0)
xpk/core/blueprint/blueprint_generator.py CHANGED
@@ -34,7 +34,7 @@ from ..system_characteristics import get_system_characteristics_by_device_type
 from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
 from ..kueue import KUEUE_VERSION
 
-yaml = yaml.YAML()
+yaml_parser = yaml.YAML()
 
 a3high_device_type = H100_DEVICE_TYPE
 a3mega_device_type = H100_MEGA_DEVICE_TYPE
@@ -52,7 +52,7 @@ blueprint_dependencies_dir = {
 }
 
 cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
-cluster_toolkit_version = "v1.57.1"
+cluster_toolkit_version = "v1.62.2"
 
 
 class BlueprintGeneratorOutput:
@@ -1019,7 +1019,7 @@ class BlueprintGenerator:
   ) -> str:
     blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
     with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
-      yaml.dump(xpk_blueprint, blueprint_file)
+      yaml_parser.dump(xpk_blueprint, blueprint_file)
     return blueprint_path
 
   def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
@@ -1033,7 +1033,7 @@ class BlueprintGenerator:
     ensure_directory_exists(storage_path_with_prefix)
     return storage_path_with_prefix
 
-  def blueprint_exists(self, blueprint_name, prefix: str = ""):
+  def blueprint_exists(self, blueprint_name, prefix: str = "") -> bool:
     blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
     return os.path.exists(blueprint_path)
 
@@ -1061,6 +1061,6 @@ class BlueprintGenerator:
 }
 
 
-yaml.register_class(Blueprint)
-yaml.register_class(DeploymentGroup)
-yaml.register_class(DeploymentModule)
+yaml_parser.register_class(Blueprint)
+yaml_parser.register_class(DeploymentGroup)
+yaml_parser.register_class(DeploymentModule)
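The yaml → yaml_parser rename matters because the old module-level instance shadowed the imported ruamel.yaml module itself (import ruamel.yaml as yaml; yaml = yaml.YAML()), making any later reference to the module impossible. A self-contained sketch of the pattern, using only the public ruamel.yaml API; the Blueprint class here is a toy stand-in for xpk's:

import io
import ruamel.yaml as yaml

# Distinct name: the YAML() instance no longer shadows the yaml module.
yaml_parser = yaml.YAML()


class Blueprint:
  """Toy stand-in for xpk's Blueprint class."""

  def __init__(self, blueprint_name: str):
    self.blueprint_name = blueprint_name


# register_class teaches the parser to dump/load instances
# as a !Blueprint-tagged YAML mapping.
yaml_parser.register_class(Blueprint)

buf = io.StringIO()
yaml_parser.dump(Blueprint('demo'), buf)
print(buf.getvalue())  # prints a !Blueprint-tagged mapping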