xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. xpk/commands/batch.py +9 -2
  2. xpk/commands/cluster.py +128 -115
  3. xpk/commands/cluster_gcluster.py +77 -14
  4. xpk/commands/cluster_gcluster_test.py +177 -0
  5. xpk/commands/common.py +10 -28
  6. xpk/commands/info.py +11 -9
  7. xpk/commands/inspector.py +21 -10
  8. xpk/commands/job.py +25 -9
  9. xpk/commands/kind.py +38 -40
  10. xpk/commands/kjob_common.py +4 -4
  11. xpk/commands/run.py +9 -2
  12. xpk/commands/shell.py +13 -10
  13. xpk/commands/storage.py +21 -0
  14. xpk/commands/version.py +0 -4
  15. xpk/commands/workload.py +43 -22
  16. xpk/core/blueprint/blueprint_generator.py +4 -40
  17. xpk/core/blueprint/blueprint_test.py +0 -6
  18. xpk/core/capacity.py +6 -5
  19. xpk/core/cluster.py +91 -194
  20. xpk/core/cluster_private.py +6 -11
  21. xpk/core/commands.py +11 -18
  22. xpk/core/config.py +1 -1
  23. xpk/core/docker_image.py +3 -4
  24. xpk/core/gcloud_context.py +26 -2
  25. xpk/core/gcloud_context_test.py +96 -0
  26. xpk/core/gcluster_manager.py +0 -3
  27. xpk/core/jobset.py +4 -7
  28. xpk/core/kjob.py +14 -27
  29. xpk/core/kueue_manager.py +383 -0
  30. xpk/core/kueue_manager_test.py +542 -0
  31. xpk/core/monitoring.py +1 -1
  32. xpk/core/nap.py +10 -15
  33. xpk/core/network.py +17 -18
  34. xpk/core/nodepool.py +66 -77
  35. xpk/core/nodepool_test.py +198 -1
  36. xpk/core/pathways.py +5 -5
  37. xpk/core/ray.py +10 -14
  38. xpk/core/resources.py +6 -11
  39. xpk/core/scheduling.py +19 -1
  40. xpk/core/scheduling_test.py +31 -0
  41. xpk/core/system_characteristics.py +335 -229
  42. xpk/core/vertex.py +1 -1
  43. xpk/core/workload.py +7 -8
  44. xpk/main.py +2 -4
  45. xpk/parser/cluster.py +7 -0
  46. xpk/parser/cluster_test.py +66 -0
  47. xpk/parser/common.py +11 -0
  48. xpk/parser/workload.py +62 -25
  49. xpk/parser/workload_test.py +82 -0
  50. xpk/utils/feature_flags.py +28 -0
  51. xpk/utils/kueue.py +20 -0
  52. xpk/utils/templates.py +2 -0
  53. xpk/utils/topology.py +37 -0
  54. xpk/utils/topology_test.py +43 -0
  55. xpk/utils/validation.py +79 -55
  56. xpk/utils/validation_test.py +37 -0
  57. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
  58. xpk-0.14.0.dist-info/RECORD +112 -0
  59. xpk/core/kueue.py +0 -561
  60. xpk-0.13.0.dist-info/RECORD +0 -101
  61. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
  64. {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/kind.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
+ from ..core.kueue_manager import (KueueConfig, KueueManager)
17
18
  from ..core.commands import (
18
19
  run_command_for_value,
19
20
  run_command_with_updates,
@@ -24,17 +25,14 @@ from ..core.kjob import (
24
25
  prepare_kjob,
25
26
  apply_kjob_crds,
26
27
  )
27
- from ..core.kueue import (
28
- install_kueue_on_cluster,
29
- install_kueue_crs,
30
- wait_for_kueue_available,
31
- )
28
+ from ..core.scheduling import get_total_chips_requested_from_args
32
29
  from ..core.storage import install_storage_crd
33
30
  from ..core.system_characteristics import (
34
31
  SystemCharacteristics,
35
32
  AcceleratorType,
36
33
  )
37
34
  from ..utils.console import (xpk_exit, xpk_print)
35
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
38
36
 
39
37
 
40
38
  def cluster_create(args) -> None:
@@ -46,6 +44,12 @@ def cluster_create(args) -> None:
46
44
  Returns:
47
45
  0 if successful and 1 otherwise.
48
46
  """
47
+ if should_validate_dependencies(args):
48
+ validate_dependencies_list([
49
+ SystemDependency.KUBECTL,
50
+ SystemDependency.KJOB,
51
+ SystemDependency.GCLOUD,
52
+ ])
49
53
  xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
50
54
 
51
55
  create_cluster_command_code = create_cluster_if_necessary(args)
@@ -64,18 +68,13 @@ def cluster_create(args) -> None:
64
68
  if set_jobset_on_cluster_code != 0:
65
69
  xpk_exit(set_jobset_on_cluster_code)
66
70
 
67
- xpk_print('Enabling Kueue on the cluster')
68
- install_kueue_on_cluster_code = install_kueue_on_cluster(args)
69
- if install_kueue_on_cluster_code != 0:
70
- xpk_exit(install_kueue_on_cluster_code)
71
-
72
71
  xpk_print('Verifying kjob installation')
73
- err_code = verify_kjob_installed(args)
72
+ err_code = verify_kjob_installed()
74
73
  if err_code > 0:
75
74
  xpk_exit(err_code)
76
75
 
77
76
  xpk_print('Applying kjob CDRs')
78
- err_code = apply_kjob_crds(args)
77
+ err_code = apply_kjob_crds()
79
78
  if err_code > 0:
80
79
  xpk_exit(err_code)
81
80
 
@@ -87,11 +86,6 @@ def cluster_create(args) -> None:
87
86
  k8s_client = setup_k8s_env(args)
88
87
  install_storage_crd(k8s_client)
89
88
 
90
- xpk_print('Wait for Kueue to be fully available')
91
- wait_for_kueue_available_code = wait_for_kueue_available(args)
92
- if wait_for_kueue_available_code != 0:
93
- xpk_exit(wait_for_kueue_available_code)
94
-
95
89
  args.num_slices = 1
96
90
  args.enable_pathways = False
97
91
  system = SystemCharacteristics(
@@ -102,12 +96,22 @@ def cluster_create(args) -> None:
102
96
  1,
103
97
  AcceleratorType['CPU'],
104
98
  'kind',
99
+ supports_sub_slicing=False,
105
100
  )
106
101
 
107
- xpk_print('Install Kueue Custom Resources')
108
- enable_kueue_credentials_code = install_kueue_crs(args, system, None)
109
- if enable_kueue_credentials_code != 0:
110
- xpk_exit(enable_kueue_credentials_code)
102
+ kueue_manager = KueueManager()
103
+ kueue_manager.install_or_upgrade(
104
+ KueueConfig(
105
+ system,
106
+ total_chips=get_total_chips_requested_from_args(args, system),
107
+ autoprovisioning_enabled=False,
108
+ num_slices=args.num_slices,
109
+ memory_limit='',
110
+ cpu_limit=0,
111
+ is_pathways_cluster=False,
112
+ flex=False,
113
+ ),
114
+ )
111
115
 
112
116
  xpk_print('Kind commands done! Resources are created.')
113
117
  xpk_exit(0)
@@ -122,6 +126,8 @@ def cluster_delete(args) -> None:
122
126
  Returns:
123
127
  0 if successful and 1 otherwise.
124
128
  """
129
+ if should_validate_dependencies(args):
130
+ validate_dependencies_list([SystemDependency.GCLOUD])
125
131
  xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
126
132
 
127
133
  run_kind_cluster_delete_command_code = run_kind_cluster_delete_command(args)
@@ -134,13 +140,12 @@ def cluster_delete(args) -> None:
134
140
  def cluster_list(args) -> None:
135
141
  """Function around cluster list.
136
142
 
137
- Args:
138
- args: user provided arguments for running the command.
139
-
140
143
  Returns:
141
144
  0 if successful and 1 otherwise.
142
145
  """
143
- if run_kind_clusters_list_command(args):
146
+ if should_validate_dependencies(args):
147
+ validate_dependencies_list([SystemDependency.GCLOUD])
148
+ if run_kind_clusters_list_command():
144
149
  xpk_exit(1)
145
150
  xpk_exit(0)
146
151
 
@@ -154,7 +159,7 @@ def create_cluster_if_necessary(args) -> int:
154
159
  Returns:
155
160
  0 if successful and 1 otherwise.
156
161
  """
157
- all_clusters, return_code = get_all_local_clusters_programmatic(args)
162
+ all_clusters, return_code = get_all_local_clusters_programmatic()
158
163
  if return_code > 0:
159
164
  xpk_print('Listing all clusters failed!')
160
165
  return 1
@@ -179,7 +184,7 @@ def run_kind_cluster_delete_command(args) -> int:
179
184
  if args.cluster:
180
185
  command += f' --name={args.cluster}'
181
186
 
182
- return_code = run_command_with_updates(command, 'Cluster Delete', args)
187
+ return_code = run_command_with_updates(command, 'Cluster Delete')
183
188
  if return_code != 0:
184
189
  xpk_print(f'Cluster delete request returned ERROR {return_code}')
185
190
  return 1
@@ -187,17 +192,14 @@ def run_kind_cluster_delete_command(args) -> int:
187
192
  return 0
188
193
 
189
194
 
190
- def run_kind_clusters_list_command(args) -> int:
195
+ def run_kind_clusters_list_command() -> int:
191
196
  """List Kind Clusters within the project and location.
192
197
 
193
- Args:
194
- args: user provided arguments for running the command.
195
-
196
198
  Returns:
197
199
  0 if successful and 1 otherwise.
198
200
  """
199
201
  command = 'kind get clusters'
200
- return_code = run_command_with_updates(command, 'Cluster List', args)
202
+ return_code = run_command_with_updates(command, 'Cluster List')
201
203
  if return_code != 0:
202
204
  xpk_print(f'Cluster list request returned ERROR {return_code}')
203
205
  return 1
@@ -222,25 +224,22 @@ def run_kind_cluster_create_command(args) -> int:
222
224
  if args.k8s_version:
223
225
  command += f' --image=kindest/node:v{args.k8s_version}'
224
226
 
225
- return_code = run_command_with_updates(command, 'Kind Cluster Create', args)
227
+ return_code = run_command_with_updates(command, 'Kind Cluster Create')
226
228
  if return_code != 0:
227
229
  xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
228
230
  return 1
229
231
  return 0
230
232
 
231
233
 
232
- def get_all_local_clusters_programmatic(args) -> tuple[list[str], int]:
234
+ def get_all_local_clusters_programmatic() -> tuple[list[str], int]:
233
235
  """Gets all the local clusters.
234
236
 
235
- Args:
236
- args: user provided arguments for running the command.
237
-
238
237
  Returns:
239
238
  List of cluster names and 0 if successful and 1 otherwise.
240
239
  """
241
240
  command = 'kind get clusters'
242
241
  return_code, raw_cluster_output = run_command_for_value(
243
- command, 'Find if Cluster Exists', args
242
+ command, 'Find if Cluster Exists'
244
243
  )
245
244
  if return_code != 0:
246
245
  xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
@@ -261,7 +260,7 @@ def set_local_cluster_command(args) -> int:
261
260
  if not args.cluster:
262
261
  command = 'kubectl config current-context'
263
262
  return_code, current_context = run_command_for_value(
264
- command, 'get current-context', args
263
+ command, 'get current-context'
265
264
  )
266
265
  xpk_print(
267
266
  'No local cluster name specified. Using current-context'
@@ -276,7 +275,6 @@ def set_local_cluster_command(args) -> int:
276
275
  return_code = run_command_with_updates(
277
276
  command,
278
277
  task,
279
- args,
280
278
  )
281
279
  if return_code != 0:
282
280
  xpk_print(f'{task} returned ERROR {return_code}')
xpk/commands/kjob_common.py CHANGED
@@ -35,11 +35,11 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
35
35
 
36
36
  annotations: tuple
37
37
  if gpu_type == H100_MEGA_DEVICE_TYPE:
38
- annotations = get_a3mega_pod_template_annotations(args)
38
+ annotations = get_a3mega_pod_template_annotations()
39
39
  elif gpu_type == H200_DEVICE_TYPE:
40
- annotations = get_a3ultra_pod_template_annotations(args)
40
+ annotations = get_a3ultra_pod_template_annotations()
41
41
  elif gpu_type == B200_DEVICE_TYPE:
42
- annotations = get_a4_pod_template_annotations(args)
42
+ annotations = get_a4_pod_template_annotations()
43
43
  else:
44
44
  annotations = tuple()
45
45
 
@@ -54,7 +54,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
54
54
  def add_TAS_annotations_to_command(args, cmd: str) -> str:
55
55
  system_characteristics = get_cluster_system_characteristics(args)
56
56
  capacity_type = get_cluster_capacity_type(args)
57
- if is_TAS_possible(system_characteristics, capacity_type, flex=False):
57
+ if is_TAS_possible(system_characteristics, capacity_type):
58
58
  cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
59
59
 
60
60
  return cmd
xpk/commands/run.py CHANGED
@@ -28,8 +28,9 @@ from ..core.kjob import (
28
28
  get_storage_annotations,
29
29
  prepare_kjob,
30
30
  )
31
- from ..core.kueue import LOCAL_QUEUE_NAME
31
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
32
32
  from ..utils.console import xpk_exit, xpk_print
33
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
33
34
  from .kind import set_local_cluster_command
34
35
  from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
35
36
 
@@ -42,6 +43,12 @@ def run(args: Namespace) -> None:
42
43
  Returns:
43
44
  None
44
45
  """
46
+ if should_validate_dependencies(args):
47
+ validate_dependencies_list([
48
+ SystemDependency.KUBECTL,
49
+ SystemDependency.KJOB,
50
+ SystemDependency.GCLOUD,
51
+ ])
45
52
  if not args.kind_cluster:
46
53
  add_zone_and_project(args)
47
54
  get_cluster_credentials(args)
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
126
133
  if args.time is not None:
127
134
  cmd += f' --time {args.time}'
128
135
 
129
- return_code = run_command_with_full_controls(cmd, 'run task', args)
136
+ return_code = run_command_with_full_controls(cmd, 'run task')
130
137
 
131
138
  if return_code != 0:
132
139
  xpk_print(f'Running task returned ERROR {return_code}')
xpk/commands/shell.py CHANGED
@@ -14,6 +14,7 @@ limitations under the License.
14
14
  from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
15
15
  from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
16
16
  from ..utils.console import xpk_exit, xpk_print
17
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
17
18
  from argparse import Namespace
18
19
 
19
20
  from ..core.kjob import (
@@ -33,14 +34,18 @@ def shell(args: Namespace):
33
34
  Returns:
34
35
  0 if successful and 1 otherwise.
35
36
  """
37
+ if should_validate_dependencies(args):
38
+ validate_dependencies_list([
39
+ SystemDependency.KUBECTL,
40
+ SystemDependency.KJOB,
41
+ SystemDependency.GCLOUD,
42
+ ])
36
43
  exisitng_shell_pod_name = get_existing_shell_pod_name(args)
37
44
 
38
45
  if exisitng_shell_pod_name is None:
39
46
  return_code = connect_to_new_interactive_shell(args)
40
47
  else:
41
- return_code = connect_to_existing_interactive_shell(
42
- exisitng_shell_pod_name, args
43
- )
48
+ return_code = connect_to_existing_interactive_shell(exisitng_shell_pod_name)
44
49
 
45
50
  if return_code != 0:
46
51
  xpk_print(f'The command failed with code {return_code}.')
@@ -60,7 +65,6 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None:
60
65
  ' -o custom-columns=":metadata.name"'
61
66
  ),
62
67
  task='Get existing interactive shell pod name.',
63
- global_args=args,
64
68
  )
65
69
  if return_code != 0:
66
70
  xpk_print(
@@ -95,21 +99,17 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
95
99
  return run_command_with_full_controls(
96
100
  command=cmd,
97
101
  task='Creating new interactive shell and entering it',
98
- global_args=args,
99
102
  instructions=exit_instructions,
100
103
  )
101
104
 
102
105
 
103
- def connect_to_existing_interactive_shell(
104
- pod_name: str, args: Namespace
105
- ) -> int:
106
+ def connect_to_existing_interactive_shell(pod_name: str) -> int:
106
107
  return run_command_with_full_controls(
107
108
  command=(
108
109
  f'kubectl exec --stdin --tty {pod_name} --'
109
110
  f' {get_pod_template_interactive_command()}'
110
111
  ),
111
112
  task='Entering existing interactive shell',
112
- global_args=args,
113
113
  instructions=exit_instructions,
114
114
  )
115
115
 
@@ -121,6 +121,10 @@ def shell_stop(args: Namespace):
121
121
  Returns:
122
122
  0 if successful and 1 otherwise.
123
123
  """
124
+ if should_validate_dependencies(args):
125
+ validate_dependencies_list(
126
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
127
+ )
124
128
  exisitng_shell_pod_name = get_existing_shell_pod_name(args)
125
129
 
126
130
  if exisitng_shell_pod_name is None:
@@ -130,7 +134,6 @@ def shell_stop(args: Namespace):
130
134
  return_code = run_command_with_updates(
131
135
  command=f'kubectl delete pod {exisitng_shell_pod_name}',
132
136
  task='Deleting the existing shell.',
133
- global_args=args,
134
137
  )
135
138
  if return_code != 0:
136
139
  xpk_exit(return_code)
xpk/commands/storage.py CHANGED
@@ -59,9 +59,14 @@ from ..core.storage import (
59
59
  from ..utils.console import get_user_input, xpk_exit, xpk_print
60
60
  from ..utils.kubectl import apply_kubectl_manifest
61
61
  from ..utils.execution_context import is_dry_run
62
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
62
63
 
63
64
 
64
65
  def storage_create(args: Namespace) -> None:
66
+ if should_validate_dependencies(args):
67
+ validate_dependencies_list(
68
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
69
+ )
65
70
  add_zone_and_project(args)
66
71
  if args.type == GCP_FILESTORE_TYPE:
67
72
  if args.instance is None:
@@ -107,6 +112,10 @@ def storage_create(args: Namespace) -> None:
107
112
 
108
113
 
109
114
  def storage_delete(args: Namespace) -> None:
115
+ if should_validate_dependencies(args):
116
+ validate_dependencies_list(
117
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
118
+ )
110
119
  add_zone_and_project(args)
111
120
  k8s_api_client = setup_k8s_env(args)
112
121
  storages = list_storages(k8s_api_client)
@@ -141,6 +150,10 @@ def storage_delete(args: Namespace) -> None:
141
150
 
142
151
 
143
152
  def storage_attach(args: Namespace) -> None:
153
+ if should_validate_dependencies(args):
154
+ validate_dependencies_list(
155
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
156
+ )
144
157
  add_zone_and_project(args)
145
158
  manifest: list[dict] = [{}]
146
159
  if args.type == GCP_FILESTORE_TYPE:
@@ -244,6 +257,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
244
257
 
245
258
 
246
259
  def storage_list(args: Namespace) -> None:
260
+ if should_validate_dependencies(args):
261
+ validate_dependencies_list(
262
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
263
+ )
247
264
  storages = []
248
265
  if not is_dry_run():
249
266
  k8s_api_client = setup_k8s_env(args)
@@ -252,6 +269,10 @@ def storage_list(args: Namespace) -> None:
252
269
 
253
270
 
254
271
  def storage_detach(args: Namespace) -> None:
272
+ if should_validate_dependencies(args):
273
+ validate_dependencies_list(
274
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
275
+ )
255
276
  k8s_api_client = setup_k8s_env(args)
256
277
  storage = get_storage(k8s_api_client, args.name)
257
278
  delete_storage_resources(k8s_api_client, storage)
xpk/commands/version.py CHANGED
@@ -18,10 +18,6 @@ from ..core.config import __version__
18
18
  from ..utils.console import xpk_print
19
19
 
20
20
 
21
- def get_xpk_version() -> str:
22
- return __version__
23
-
24
-
25
21
  def version(args) -> None: # pylint: disable=unused-argument
26
22
  """Get version of xpk."""
27
23
  xpk_print('xpk_version:', __version__)
xpk/commands/workload.py CHANGED
@@ -34,7 +34,7 @@ from ..core.docker_container import (
34
34
  )
35
35
  from ..core.docker_resources import get_volumes, parse_env_config
36
36
  from ..core.gcloud_context import add_zone_and_project
37
- from ..core.kueue import LOCAL_QUEUE_NAME
37
+ from ..core.kueue_manager import LOCAL_QUEUE_NAME
38
38
  from ..core.monitoring import get_gke_outlier_dashboard
39
39
  from ..core.nap import (
40
40
  get_autoprovisioning_node_selector_args,
@@ -53,9 +53,6 @@ from ..core.pathways import (
53
53
  try_to_delete_pathwaysjob_first,
54
54
  )
55
55
  from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
56
- from ..core.capacity import (
57
- CapacityType,
58
- )
59
56
  from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
60
57
  from ..core.scheduling import (
61
58
  check_if_workload_can_schedule,
@@ -65,6 +62,7 @@ from ..core.scheduling import (
65
62
  create_tpu_topology,
66
63
  get_cpu_affinity,
67
64
  get_gpu_scheduler,
65
+ create_sub_slicing_annotations,
68
66
  )
69
67
  from ..core.storage import (
70
68
  GCE_PD_TYPE,
@@ -87,7 +85,7 @@ from ..core.workload import (
87
85
  get_jobsets_list_gcp_link,
88
86
  get_workload_list,
89
87
  wait_for_job_completion,
90
- zone_to_region,
88
+ get_cluster_location,
91
89
  )
92
90
  from ..core.workload_decorators import (
93
91
  rdma_decorator,
@@ -98,8 +96,10 @@ from ..core.workload_decorators import (
98
96
  from ..utils.console import get_user_input, xpk_exit, xpk_print
99
97
  from ..utils.file import write_tmp_file
100
98
  from ..utils.execution_context import is_dry_run
99
+ from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
101
100
  from . import cluster_gcluster
102
101
  from .common import is_TAS_possible
102
+ from ..utils.feature_flags import FeatureFlags
103
103
 
104
104
  WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
105
105
  kind: JobSet
@@ -130,6 +130,7 @@ spec:
130
130
  xpk.google.com/workload: {args.workload}
131
131
  annotations:
132
132
  {storage_annotations}
133
+ {sub_slicing_annotations}
133
134
  spec:
134
135
  schedulerName: {args.scheduler}
135
136
  imagePullSecrets:
@@ -267,6 +268,8 @@ PW_WORKLOAD_CREATE_YAML = """
267
268
  maxSliceRestarts: {args.max_slice_restarts}
268
269
  terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
269
270
  priorityClassName: {args.priority}
271
+ nodeSelector:
272
+ {autoprovisioning_args}
270
273
  pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
271
274
  controller:
272
275
  # #Pod template for training, default mode.
@@ -307,6 +310,12 @@ def workload_create(args) -> None:
307
310
  Returns:
308
311
  0 if successful and 1 otherwise.
309
312
  """
313
+ if should_validate_dependencies(args):
314
+ validate_dependencies_list([
315
+ SystemDependency.KUBECTL,
316
+ SystemDependency.GCLOUD,
317
+ SystemDependency.DOCKER,
318
+ ])
310
319
  k8s_api_client = None
311
320
  if not is_dry_run():
312
321
  k8s_api_client = setup_k8s_env(args)
@@ -334,7 +343,7 @@ def workload_create(args) -> None:
334
343
  xpk_print('Starting workload create', flush=True)
335
344
 
336
345
  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
337
- cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
346
+ cluster_config_map = get_cluster_configmap(metadata_configmap_name)
338
347
  cluster_xpk_version = None
339
348
  if cluster_config_map is None:
340
349
  xpk_print(
@@ -482,16 +491,12 @@ def workload_create(args) -> None:
482
491
  capacity_type = get_cluster_capacity_type(args)
483
492
 
484
493
  annotations = (
485
- ''
486
- if not is_TAS_possible(
487
- system_characteristics,
488
- capacity_type,
489
- flex=True if capacity_type == CapacityType.FLEX_START else False,
490
- )
491
- else (
494
+ (
492
495
  'kueue.x-k8s.io/podset-preferred-topology:'
493
496
  ' "cloud.google.com/gce-topology-host"'
494
497
  )
498
+ if is_TAS_possible(system_characteristics, capacity_type)
499
+ else ''
495
500
  )
496
501
 
497
502
  if (
@@ -507,7 +512,7 @@ def workload_create(args) -> None:
507
512
  annotations=annotations,
508
513
  )
509
514
 
510
- sub_networks = get_cluster_subnetworks(args)
515
+ sub_networks = get_cluster_subnetworks()
511
516
  if args.device_type == a3high_device_type:
512
517
  yml_string = tcpx_decorator.decorate_jobset(yml_string)
513
518
  elif args.device_type == a3mega_device_type:
@@ -545,6 +550,7 @@ def workload_create(args) -> None:
545
550
  colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
546
551
  user_workload=get_user_workload_for_pathways(args, system),
547
552
  local_queue_name=LOCAL_QUEUE_NAME,
553
+ autoprovisioning_args=autoprovisioning_args,
548
554
  )
549
555
  else:
550
556
  container, debugging_dashboard_id = get_user_workload_container(
@@ -558,6 +564,14 @@ def workload_create(args) -> None:
558
564
  accelerator_label=create_accelerator_label(
559
565
  system.accelerator_type, system
560
566
  ),
567
+ sub_slicing_annotations=(
568
+ ''
569
+ if not FeatureFlags.SUB_SLICING_ENABLED
570
+ or args.sub_slicing_topology is None
571
+ else ('\n' + (' ' * 16)).join(
572
+ create_sub_slicing_annotations(args.sub_slicing_topology)
573
+ )
574
+ ),
561
575
  machine_label=create_machine_label(system.accelerator_type, system),
562
576
  local_queue_name=LOCAL_QUEUE_NAME,
563
577
  autoprovisioning_args=autoprovisioning_args,
@@ -575,7 +589,7 @@ def workload_create(args) -> None:
575
589
  )
576
590
  tmp = write_tmp_file(yml_string)
577
591
  command = f'kubectl apply -f {str(tmp)}'
578
- return_code = run_command_with_updates(command, 'Creating Workload', args)
592
+ return_code = run_command_with_updates(command, 'Creating Workload')
579
593
 
580
594
  if return_code != 0:
581
595
  xpk_print(f'Create Workload request returned ERROR {return_code}')
@@ -622,7 +636,9 @@ def workload_create(args) -> None:
622
636
  ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
623
637
  " python -c 'import pathwaysutils; import jax; print(jax.devices())'"
624
638
  )
625
- pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
639
+ pathways_proxy_link = (
640
+ f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
641
+ )
626
642
  xpk_print(
627
643
  'Follow the proxy here:'
628
644
  # pylint: disable=line-too-long)
@@ -636,7 +652,7 @@ def workload_create(args) -> None:
636
652
  xpk_print(
637
653
  'Follow your workload here:'
638
654
  # pylint: disable=line-too-long
639
- f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
655
+ f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
640
656
  )
641
657
  duration_of_logs = 'P1D' # Past 1 Day
642
658
  xpk_print(
@@ -645,7 +661,7 @@ def workload_create(args) -> None:
645
661
  ' ([prefix]-slice-job-[slice_number]-[worker_number])'
646
662
  ' after clicking the url if you want other worker logs.'
647
663
  # pylint: disable=line-too-long
648
- f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
664
+ f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
649
665
  )
650
666
 
651
667
  xpk_exit(0)
@@ -678,6 +694,10 @@ def workload_delete(args) -> None:
678
694
  Returns:
679
695
  0 if successful and 1 otherwise.
680
696
  """
697
+ if should_validate_dependencies(args):
698
+ validate_dependencies_list(
699
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
700
+ )
681
701
  xpk_print('Starting Workload delete', flush=True)
682
702
  add_zone_and_project(args)
683
703
  get_cluster_credentials(args)
@@ -725,16 +745,13 @@ def workload_delete(args) -> None:
725
745
 
726
746
  # Not batching deletion for single workload
727
747
  if len(workloads) == 1:
728
- return_code = run_command_with_updates(
729
- commands[0], 'Delete Workload', args
730
- )
748
+ return_code = run_command_with_updates(commands[0], 'Delete Workload')
731
749
  else:
732
750
  return_code = run_commands(
733
751
  commands,
734
752
  'Delete Workload',
735
753
  task_names,
736
754
  batch=100,
737
- dry_run=args.dry_run,
738
755
  )
739
756
 
740
757
  if return_code != 0:
@@ -752,6 +769,10 @@ def workload_list(args) -> None:
752
769
  Returns:
753
770
  0 if successful and 1 otherwise.
754
771
  """
772
+ if should_validate_dependencies(args):
773
+ validate_dependencies_list(
774
+ [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
775
+ )
755
776
  xpk_print('Starting workload list', flush=True)
756
777
  add_zone_and_project(args)
757
778
  get_cluster_credentials(args)