xpk 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. xpk/commands/batch.py +17 -10
  2. xpk/commands/cluster.py +137 -123
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +13 -27
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +22 -11
  8. xpk/commands/job.py +53 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +26 -2
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +58 -30
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +96 -195
  20. xpk/core/cluster_private.py +9 -12
  21. xpk/core/commands.py +21 -25
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +17 -9
  24. xpk/core/docker_resources.py +9 -4
  25. xpk/core/gcloud_context.py +26 -2
  26. xpk/core/gcloud_context_test.py +96 -0
  27. xpk/core/gcluster_manager.py +0 -3
  28. xpk/core/jobset.py +5 -8
  29. xpk/core/kjob.py +19 -29
  30. xpk/core/kueue_manager.py +383 -0
  31. xpk/core/kueue_manager_test.py +542 -0
  32. xpk/core/monitoring.py +1 -1
  33. xpk/core/nap.py +11 -16
  34. xpk/core/network.py +18 -19
  35. xpk/core/nodepool.py +65 -71
  36. xpk/core/nodepool_test.py +198 -1
  37. xpk/core/pathways.py +9 -5
  38. xpk/core/ray.py +11 -15
  39. xpk/core/resources.py +15 -10
  40. xpk/core/scheduling.py +23 -1
  41. xpk/core/scheduling_test.py +31 -0
  42. xpk/core/system_characteristics.py +335 -229
  43. xpk/core/vertex.py +1 -1
  44. xpk/core/workload.py +7 -8
  45. xpk/main.py +3 -2
  46. xpk/parser/cluster.py +50 -0
  47. xpk/parser/cluster_test.py +66 -0
  48. xpk/parser/common.py +11 -0
  49. xpk/parser/workload.py +62 -25
  50. xpk/parser/workload_test.py +82 -0
  51. xpk/utils/execution_context.py +28 -0
  52. xpk/utils/feature_flags.py +28 -0
  53. xpk/utils/file.py +25 -10
  54. xpk/utils/kueue.py +20 -0
  55. xpk/utils/network.py +4 -0
  56. xpk/utils/templates.py +2 -0
  57. xpk/utils/topology.py +37 -0
  58. xpk/utils/topology_test.py +43 -0
  59. xpk/utils/validation.py +79 -55
  60. xpk/utils/validation_test.py +37 -0
  61. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  62. xpk-0.14.0.dist-info/RECORD +112 -0
  63. xpk/core/kueue.py +0 -545
  64. xpk-0.12.0.dist-info/RECORD +0 -100
  65. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  66. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.12.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/kind.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from ..core.kueue_manager import (KueueConfig, KueueManager)
 from ..core.commands import (
     run_command_for_value,
     run_command_with_updates,
@@ -24,17 +25,14 @@ from ..core.kjob import (
     prepare_kjob,
     apply_kjob_crds,
 )
-from ..core.kueue import (
-    install_kueue_on_cluster,
-    install_kueue_crs,
-    wait_for_kueue_available,
-)
+from ..core.scheduling import get_total_chips_requested_from_args
 from ..core.storage import install_storage_crd
 from ..core.system_characteristics import (
     SystemCharacteristics,
     AcceleratorType,
 )
 from ..utils.console import (xpk_exit, xpk_print)
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 
 
 def cluster_create(args) -> None:
@@ -46,6 +44,12 @@ def cluster_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
 
   create_cluster_command_code = create_cluster_if_necessary(args)
@@ -64,18 +68,13 @@ def cluster_create(args) -> None:
   if set_jobset_on_cluster_code != 0:
     xpk_exit(set_jobset_on_cluster_code)
 
-  xpk_print('Enabling Kueue on the cluster')
-  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
-  if install_kueue_on_cluster_code != 0:
-    xpk_exit(install_kueue_on_cluster_code)
-
   xpk_print('Verifying kjob installation')
-  err_code = verify_kjob_installed(args)
+  err_code = verify_kjob_installed()
   if err_code > 0:
     xpk_exit(err_code)
 
   xpk_print('Applying kjob CDRs')
-  err_code = apply_kjob_crds(args)
+  err_code = apply_kjob_crds()
   if err_code > 0:
     xpk_exit(err_code)
 
@@ -87,11 +86,6 @@ def cluster_create(args) -> None:
   k8s_client = setup_k8s_env(args)
   install_storage_crd(k8s_client)
 
-  xpk_print('Wait for Kueue to be fully available')
-  wait_for_kueue_available_code = wait_for_kueue_available(args)
-  if wait_for_kueue_available_code != 0:
-    xpk_exit(wait_for_kueue_available_code)
-
   args.num_slices = 1
   args.enable_pathways = False
   system = SystemCharacteristics(
@@ -102,12 +96,22 @@ def cluster_create(args) -> None:
       1,
       AcceleratorType['CPU'],
       'kind',
+      supports_sub_slicing=False,
   )
 
-  xpk_print('Install Kueue Custom Resources')
-  enable_kueue_credentials_code = install_kueue_crs(args, system, None)
-  if enable_kueue_credentials_code != 0:
-    xpk_exit(enable_kueue_credentials_code)
+  kueue_manager = KueueManager()
+  kueue_manager.install_or_upgrade(
+      KueueConfig(
+          system,
+          total_chips=get_total_chips_requested_from_args(args, system),
+          autoprovisioning_enabled=False,
+          num_slices=args.num_slices,
+          memory_limit='',
+          cpu_limit=0,
+          is_pathways_cluster=False,
+          flex=False,
+      ),
+  )
 
   xpk_print('Kind commands done! Resources are created.')
   xpk_exit(0)
@@ -122,6 +126,8 @@ def cluster_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
   xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
 
   run_kind_cluster_delete_command_code = run_kind_cluster_delete_command(args)
@@ -134,13 +140,12 @@ def cluster_delete(args) -> None:
 def cluster_list(args) -> None:
   """Function around cluster list.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """
-  if run_kind_clusters_list_command(args):
+  if should_validate_dependencies(args):
+    validate_dependencies_list([SystemDependency.GCLOUD])
+  if run_kind_clusters_list_command():
     xpk_exit(1)
   xpk_exit(0)
 
@@ -154,7 +159,7 @@ def create_cluster_if_necessary(args) -> int:
   Returns:
     0 if successful and 1 otherwise.
   """
-  all_clusters, return_code = get_all_local_clusters_programmatic(args)
+  all_clusters, return_code = get_all_local_clusters_programmatic()
   if return_code > 0:
     xpk_print('Listing all clusters failed!')
     return 1
@@ -179,7 +184,7 @@ def run_kind_cluster_delete_command(args) -> int:
   if args.cluster:
     command += f' --name={args.cluster}'
 
-  return_code = run_command_with_updates(command, 'Cluster Delete', args)
+  return_code = run_command_with_updates(command, 'Cluster Delete')
   if return_code != 0:
     xpk_print(f'Cluster delete request returned ERROR {return_code}')
     return 1
@@ -187,17 +192,14 @@ def run_kind_cluster_delete_command(args) -> int:
   return 0
 
 
-def run_kind_clusters_list_command(args) -> int:
+def run_kind_clusters_list_command() -> int:
   """List Kind Clusters within the project and location.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     0 if successful and 1 otherwise.
   """
   command = 'kind get clusters'
-  return_code = run_command_with_updates(command, 'Cluster List', args)
+  return_code = run_command_with_updates(command, 'Cluster List')
   if return_code != 0:
     xpk_print(f'Cluster list request returned ERROR {return_code}')
     return 1
@@ -222,25 +224,22 @@ def run_kind_cluster_create_command(args) -> int:
   if args.k8s_version:
     command += f' --image=kindest/node:v{args.k8s_version}'
 
-  return_code = run_command_with_updates(command, 'Kind Cluster Create', args)
+  return_code = run_command_with_updates(command, 'Kind Cluster Create')
   if return_code != 0:
     xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
     return 1
   return 0
 
 
-def get_all_local_clusters_programmatic(args) -> tuple[list[str], int]:
+def get_all_local_clusters_programmatic() -> tuple[list[str], int]:
   """Gets all the local clusters.
 
-  Args:
-    args: user provided arguments for running the command.
-
   Returns:
     List of cluster names and 0 if successful and 1 otherwise.
   """
   command = 'kind get clusters'
   return_code, raw_cluster_output = run_command_for_value(
-      command, 'Find if Cluster Exists', args
+      command, 'Find if Cluster Exists'
   )
   if return_code != 0:
     xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
@@ -261,7 +260,7 @@ def set_local_cluster_command(args) -> int:
   if not args.cluster:
     command = 'kubectl config current-context'
     return_code, current_context = run_command_for_value(
-        command, 'get current-context', args
+        command, 'get current-context'
     )
     xpk_print(
         'No local cluster name specified. Using current-context'
@@ -276,7 +275,6 @@ def set_local_cluster_command(args) -> int:
   return_code = run_command_with_updates(
       command,
       task,
-      args,
   )
   if return_code != 0:
     xpk_print(f'{task} returned ERROR {return_code}')
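Note on the hunks above: the three-step Kueue flow that `cluster_create` used to run (`install_kueue_on_cluster`, `wait_for_kueue_available`, `install_kueue_crs`) collapses into a single `KueueManager.install_or_upgrade` call. The real definitions live in the new xpk/core/kueue_manager.py (+383 lines, not shown in this diff); the sketch below only reconstructs the surface implied by the call site above, so the field types, the docstring, and the `None` return are assumptions.

```python
# Hypothetical reconstruction of the kueue_manager surface, inferred solely
# from the cluster_create call site above -- not the actual implementation.
from dataclasses import dataclass


@dataclass
class KueueConfig:
  system: 'SystemCharacteristics'  # accelerator/topology description
  total_chips: int
  autoprovisioning_enabled: bool
  num_slices: int
  memory_limit: str
  cpu_limit: int
  is_pathways_cluster: bool
  flex: bool


class KueueManager:
  """Installs Kueue if absent, upgrades it in place otherwise (assumed)."""

  def install_or_upgrade(self, config: KueueConfig) -> None:
    ...  # the helm/kubectl work happens in the real xpk/core/kueue_manager.py
```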
xpk/commands/kjob_common.py CHANGED
@@ -35,11 +35,11 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
 
   annotations: tuple
   if gpu_type == H100_MEGA_DEVICE_TYPE:
-    annotations = get_a3mega_pod_template_annotations(args)
+    annotations = get_a3mega_pod_template_annotations()
   elif gpu_type == H200_DEVICE_TYPE:
-    annotations = get_a3ultra_pod_template_annotations(args)
+    annotations = get_a3ultra_pod_template_annotations()
   elif gpu_type == B200_DEVICE_TYPE:
-    annotations = get_a4_pod_template_annotations(args)
+    annotations = get_a4_pod_template_annotations()
   else:
     annotations = tuple()
 
@@ -54,7 +54,7 @@ def add_TAS_annotations_to_command(args, cmd: str) -> str:
 def add_TAS_annotations_to_command(args, cmd: str) -> str:
   system_characteristics = get_cluster_system_characteristics(args)
   capacity_type = get_cluster_capacity_type(args)
-  if is_TAS_possible(system_characteristics, capacity_type, flex=False):
+  if is_TAS_possible(system_characteristics, capacity_type):
     cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
 
   return cmd
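The same simplification recurs below: `is_TAS_possible` loses its `flex` keyword both here and in workload.py, whose caller previously computed `flex=True if capacity_type == CapacityType.FLEX_START else False`. Both call sites now pass only the system characteristics and the capacity type, so the flex-start special case has presumably moved inside the helper itself (defined in xpk/commands/common.py, which this diff lists as changed but does not show).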
xpk/commands/run.py CHANGED
@@ -28,8 +28,9 @@ from ..core.kjob import (
     get_storage_annotations,
     prepare_kjob,
 )
-from ..core.kueue import LOCAL_QUEUE_NAME
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from .kind import set_local_cluster_command
 from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
 
@@ -42,6 +43,12 @@ def run(args: Namespace) -> None:
   Returns:
     None
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   if not args.kind_cluster:
     add_zone_and_project(args)
     get_cluster_credentials(args)
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
   if args.time is not None:
     cmd += f' --time {args.time}'
 
-  return_code = run_command_with_full_controls(cmd, 'run task', args)
+  return_code = run_command_with_full_controls(cmd, 'run task')
 
   if return_code != 0:
     xpk_print(f'Running task returned ERROR {return_code}')
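Every command entry point in this release gains the same prologue: `should_validate_dependencies(args)` gating a `validate_dependencies_list([...])` call listing the binaries that command needs. The actual checks live in xpk/utils/validation.py (+79/-55, with a new validation_test.py); the sketch below is only a plausible mental model of that guard, and the enum values, the skip mechanism, and the failure mode are all assumptions.

```python
# Plausible shape of the new dependency guard -- an illustration, not the
# code in xpk/utils/validation.py.
import shutil
from enum import Enum


class SystemDependency(Enum):
  KUBECTL = 'kubectl'
  GCLOUD = 'gcloud'
  KJOB = 'kjob'      # assumed binary name for the kjob dependency
  DOCKER = 'docker'


def should_validate_dependencies(args) -> bool:
  # Assumption: validation can be switched off, e.g. for tests or dry runs.
  return not getattr(args, 'skip_validation', False)


def validate_dependencies_list(deps: list[SystemDependency]) -> None:
  # Report every missing tool at once rather than failing on the first.
  missing = [dep.value for dep in deps if shutil.which(dep.value) is None]
  if missing:
    raise SystemExit(f'missing required tools on PATH: {", ".join(missing)}')
```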
xpk/commands/shell.py CHANGED
@@ -14,6 +14,7 @@ limitations under the License.
 from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
 from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from argparse import Namespace
 
 from ..core.kjob import (
@@ -33,14 +34,18 @@ def shell(args: Namespace):
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.KJOB,
+        SystemDependency.GCLOUD,
+    ])
   exisitng_shell_pod_name = get_existing_shell_pod_name(args)
 
   if exisitng_shell_pod_name is None:
     return_code = connect_to_new_interactive_shell(args)
   else:
-    return_code = connect_to_existing_interactive_shell(
-        exisitng_shell_pod_name, args
-    )
+    return_code = connect_to_existing_interactive_shell(exisitng_shell_pod_name)
 
   if return_code != 0:
     xpk_print(f'The command failed with code {return_code}.')
@@ -60,7 +65,6 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None:
           ' -o custom-columns=":metadata.name"'
       ),
       task='Get existing interactive shell pod name.',
-      global_args=args,
   )
   if return_code != 0:
     xpk_print(
@@ -95,21 +99,17 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
   return run_command_with_full_controls(
       command=cmd,
       task='Creating new interactive shell and entering it',
-      global_args=args,
       instructions=exit_instructions,
   )
 
 
-def connect_to_existing_interactive_shell(
-    pod_name: str, args: Namespace
-) -> int:
+def connect_to_existing_interactive_shell(pod_name: str) -> int:
   return run_command_with_full_controls(
       command=(
           f'kubectl exec --stdin --tty {pod_name} --'
          f' {get_pod_template_interactive_command()}'
       ),
       task='Entering existing interactive shell',
-      global_args=args,
       instructions=exit_instructions,
   )
 
@@ -121,6 +121,10 @@ def shell_stop(args: Namespace):
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   exisitng_shell_pod_name = get_existing_shell_pod_name(args)
 
   if exisitng_shell_pod_name is None:
@@ -130,7 +134,6 @@ def shell_stop(args: Namespace):
   return_code = run_command_with_updates(
       command=f'kubectl delete pod {exisitng_shell_pod_name}',
       task='Deleting the existing shell.',
-      global_args=args,
   )
   if return_code != 0:
     xpk_exit(return_code)
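The removed `global_args=args` arguments are part of a change visible throughout this diff: `run_command_for_value`, `run_command_with_updates`, and `run_command_with_full_controls` no longer take the parsed CLI namespace, and xpk/core/commands.py (+21/-25) presumably reads process-wide settings from the new xpk/utils/execution_context.py (+28) instead. A minimal sketch of what such a module-level context could look like, with `is_dry_run` taken from the imports used below and everything else assumed:

```python
# Hypothetical sketch of xpk/utils/execution_context.py: a flag set once at
# startup and queried anywhere, replacing the threaded `args`/`global_args`.
_dry_run: bool = False


def set_dry_run(value: bool) -> None:  # setter name is an assumption
  global _dry_run
  _dry_run = value


def is_dry_run() -> bool:
  return _dry_run
```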
xpk/commands/storage.py CHANGED
@@ -58,9 +58,15 @@ from ..core.storage import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.kubectl import apply_kubectl_manifest
+from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 
 
 def storage_create(args: Namespace) -> None:
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   add_zone_and_project(args)
   if args.type == GCP_FILESTORE_TYPE:
     if args.instance is None:
@@ -106,6 +112,10 @@ def storage_create(args: Namespace) -> None:
 
 
 def storage_delete(args: Namespace) -> None:
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   add_zone_and_project(args)
   k8s_api_client = setup_k8s_env(args)
   storages = list_storages(k8s_api_client)
@@ -140,6 +150,10 @@ def storage_delete(args: Namespace) -> None:
 
 
 def storage_attach(args: Namespace) -> None:
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   add_zone_and_project(args)
   manifest: list[dict] = [{}]
   if args.type == GCP_FILESTORE_TYPE:
@@ -243,12 +257,22 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
 
 
 def storage_list(args: Namespace) -> None:
-  k8s_api_client = setup_k8s_env(args)
-  storages = list_storages(k8s_api_client)
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = list_storages(k8s_api_client)
   print_storages_for_cluster(storages)
 
 
 def storage_detach(args: Namespace) -> None:
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   k8s_api_client = setup_k8s_env(args)
   storage = get_storage(k8s_api_client, args.name)
   delete_storage_resources(k8s_api_client, storage)
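With that guard in place, `storage_list` no longer requires live cluster access: in dry-run mode it skips `setup_k8s_env` entirely and presumably renders an empty listing via `print_storages_for_cluster([])`, while `storage_detach` and the other subcommands still build their Kubernetes client unconditionally.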
xpk/commands/version.py CHANGED
@@ -18,10 +18,6 @@ from ..core.config import __version__
 from ..utils.console import xpk_print
 
 
-def get_xpk_version() -> str:
-  return __version__
-
-
 def version(args) -> None: # pylint: disable=unused-argument
   """Get version of xpk."""
   xpk_print('xpk_version:', __version__)
xpk/commands/workload.py CHANGED
@@ -34,7 +34,7 @@ from ..core.docker_container import (
 )
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
-from ..core.kueue import LOCAL_QUEUE_NAME
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
 from ..core.nap import (
     get_autoprovisioning_node_selector_args,
@@ -53,9 +53,6 @@ from ..core.pathways import (
     try_to_delete_pathwaysjob_first,
 )
 from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
-from ..core.capacity import (
-    CapacityType,
-)
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
@@ -65,6 +62,7 @@ from ..core.scheduling import (
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
+    create_sub_slicing_annotations,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -87,7 +85,7 @@ from ..core.workload import (
     get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
-    zone_to_region,
+    get_cluster_location,
 )
 from ..core.workload_decorators import (
     rdma_decorator,
@@ -97,8 +95,11 @@ from ..core.workload_decorators import (
 )
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
 from .common import is_TAS_possible
+from ..utils.feature_flags import FeatureFlags
 
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
@@ -129,6 +130,7 @@ spec:
                 xpk.google.com/workload: {args.workload}
               annotations:
                 {storage_annotations}
+                {sub_slicing_annotations}
             spec:
               schedulerName: {args.scheduler}
              imagePullSecrets:
@@ -266,6 +268,8 @@ PW_WORKLOAD_CREATE_YAML = """
       maxSliceRestarts: {args.max_slice_restarts}
       terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
       priorityClassName: {args.priority}
+      nodeSelector:
+        {autoprovisioning_args}
       pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
       controller:
         # #Pod template for training, default mode.
@@ -306,8 +310,16 @@ def workload_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  k8s_api_client = setup_k8s_env(args)
-  setup_k8s_service_accounts()
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.GCLOUD,
+        SystemDependency.DOCKER,
+    ])
+  k8s_api_client = None
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    setup_k8s_service_accounts()
 
   workload_exists = check_if_workload_exists(args)
 
@@ -331,7 +343,7 @@ def workload_create(args) -> None:
   xpk_print('Starting workload create', flush=True)
 
   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
@@ -383,8 +395,10 @@ def workload_create(args) -> None:
   all_storages = []
   # Currently storage customization is not supported for Pathways workloads. b/408468941
   if not args.use_pathways:
-    storages: list[Storage] = get_storages_to_mount(
-        k8s_api_client, args.storage
+    storages: list[Storage] = (
+        []
+        if k8s_api_client is None
+        else get_storages_to_mount(k8s_api_client, args.storage)
     )
     gcs_fuse_storages = list(
         filter(lambda storage: storage.type == GCS_FUSE_TYPE, storages)
@@ -477,16 +491,12 @@ def workload_create(args) -> None:
   capacity_type = get_cluster_capacity_type(args)
 
   annotations = (
-      ''
-      if not is_TAS_possible(
-          system_characteristics,
-          capacity_type,
-          flex=True if capacity_type == CapacityType.FLEX_START else False,
-      )
-      else (
+      (
          'kueue.x-k8s.io/podset-preferred-topology:'
          ' "cloud.google.com/gce-topology-host"'
       )
+      if is_TAS_possible(system_characteristics, capacity_type)
+      else ''
   )
 
   if (
@@ -502,7 +512,7 @@ def workload_create(args) -> None:
       annotations=annotations,
   )
 
-  sub_networks = get_cluster_subnetworks(args)
+  sub_networks = get_cluster_subnetworks()
   if args.device_type == a3high_device_type:
     yml_string = tcpx_decorator.decorate_jobset(yml_string)
   elif args.device_type == a3mega_device_type:
@@ -540,6 +550,7 @@
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
         user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
+        autoprovisioning_args=autoprovisioning_args,
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -553,6 +564,14 @@
         accelerator_label=create_accelerator_label(
            system.accelerator_type, system
        ),
+        sub_slicing_annotations=(
+            ''
+            if not FeatureFlags.SUB_SLICING_ENABLED
+            or args.sub_slicing_topology is None
+            else ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(args.sub_slicing_topology)
+            )
+        ),
         machine_label=create_machine_label(system.accelerator_type, system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
@@ -569,14 +588,14 @@
       pod_failure_policy=pod_failure_policy,
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp.file.name)}'
-  return_code = run_command_with_updates(command, 'Creating Workload', args)
+  command = f'kubectl apply -f {str(tmp)}'
+  return_code = run_command_with_updates(command, 'Creating Workload')
 
   if return_code != 0:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
     xpk_exit(return_code)
 
-  if not args.use_pathways:
+  if not args.use_pathways and not is_dry_run():
     add_bucket_iam_members(args, storages)
 
   # Get GKE outlier dashboard for TPU
@@ -617,7 +636,9 @@
       ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
       " python -c 'import pathwaysutils; import jax; print(jax.devices())'"
   )
-  pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+  pathways_proxy_link = (
+      f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+  )
   xpk_print(
       'Follow the proxy here:'
       # pylint: disable=line-too-long)
@@ -631,7 +652,7 @@
   xpk_print(
       'Follow your workload here:'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+      f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
   )
   duration_of_logs = 'P1D' # Past 1 Day
   xpk_print(
@@ -640,7 +661,7 @@
       ' ([prefix]-slice-job-[slice_number]-[worker_number])'
       ' after clicking the url if you want other worker logs.'
       # pylint: disable=line-too-long
-      f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
+      f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
   )
 
   xpk_exit(0)
@@ -673,6 +694,10 @@ def workload_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting Workload delete', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
@@ -720,12 +745,13 @@
 
   # Not batching deletion for single workload
   if len(workloads) == 1:
-    return_code = run_command_with_updates(
-        commands[0], 'Delete Workload', args
-    )
+    return_code = run_command_with_updates(commands[0], 'Delete Workload')
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands,
+        'Delete Workload',
+        task_names,
+        batch=100,
    )
 
   if return_code != 0:
@@ -743,8 +769,10 @@ def workload_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
-  xpk_print(args)
-
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting workload list', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
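A detail of the sub-slicing templating above that is easy to misread: `create_sub_slicing_annotations` evidently returns a list of YAML annotation lines, and the `('\n' + (' ' * 16)).join` re-indents every line after the first to the column of the `{sub_slicing_annotations}` placeholder in `WORKLOAD_CREATE_YAML`. A self-contained demonstration of that formatting idiom (the annotation keys here are invented; the real ones come from xpk/core/scheduling.py):

```python
# Demonstrates the indentation-preserving join used for
# {sub_slicing_annotations}; the annotation keys below are placeholders.
annotations = [
    'example.com/sub-slicing-topology: "2x2"',
    'example.com/sub-slicing-enabled: "true"',
]
rendered = ('\n' + (' ' * 16)).join(annotations)

template = """          annotations:
                {sub_slicing_annotations}"""
print(template.format(sub_slicing_annotations=rendered))
# Every annotation after the first lands at column 16, aligned with the
# placeholder, so the generated JobSet YAML stays well-formed.
```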