xpk 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +9 -2
- xpk/commands/cluster.py +128 -115
- xpk/commands/cluster_gcluster.py +77 -14
- xpk/commands/cluster_gcluster_test.py +177 -0
- xpk/commands/common.py +10 -28
- xpk/commands/info.py +11 -9
- xpk/commands/inspector.py +21 -10
- xpk/commands/job.py +25 -9
- xpk/commands/kind.py +38 -40
- xpk/commands/kjob_common.py +4 -4
- xpk/commands/run.py +9 -2
- xpk/commands/shell.py +13 -10
- xpk/commands/storage.py +21 -0
- xpk/commands/version.py +0 -4
- xpk/commands/workload.py +43 -22
- xpk/core/blueprint/blueprint_generator.py +4 -40
- xpk/core/blueprint/blueprint_test.py +0 -6
- xpk/core/capacity.py +6 -5
- xpk/core/cluster.py +91 -194
- xpk/core/cluster_private.py +6 -11
- xpk/core/commands.py +11 -18
- xpk/core/config.py +1 -1
- xpk/core/docker_image.py +3 -4
- xpk/core/gcloud_context.py +26 -2
- xpk/core/gcloud_context_test.py +96 -0
- xpk/core/gcluster_manager.py +0 -3
- xpk/core/jobset.py +4 -7
- xpk/core/kjob.py +14 -27
- xpk/core/kueue_manager.py +383 -0
- xpk/core/kueue_manager_test.py +542 -0
- xpk/core/monitoring.py +1 -1
- xpk/core/nap.py +10 -15
- xpk/core/network.py +17 -18
- xpk/core/nodepool.py +66 -77
- xpk/core/nodepool_test.py +198 -1
- xpk/core/pathways.py +5 -5
- xpk/core/ray.py +10 -14
- xpk/core/resources.py +6 -11
- xpk/core/scheduling.py +19 -1
- xpk/core/scheduling_test.py +31 -0
- xpk/core/system_characteristics.py +335 -229
- xpk/core/vertex.py +1 -1
- xpk/core/workload.py +7 -8
- xpk/main.py +2 -4
- xpk/parser/cluster.py +7 -0
- xpk/parser/cluster_test.py +66 -0
- xpk/parser/common.py +11 -0
- xpk/parser/workload.py +62 -25
- xpk/parser/workload_test.py +82 -0
- xpk/utils/feature_flags.py +28 -0
- xpk/utils/kueue.py +20 -0
- xpk/utils/templates.py +2 -0
- xpk/utils/topology.py +37 -0
- xpk/utils/topology_test.py +43 -0
- xpk/utils/validation.py +79 -55
- xpk/utils/validation_test.py +37 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/METADATA +6 -1
- xpk-0.14.0.dist-info/RECORD +112 -0
- xpk/core/kueue.py +0 -561
- xpk-0.13.0.dist-info/RECORD +0 -101
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/WHEEL +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.13.0.dist-info → xpk-0.14.0.dist-info}/top_level.txt +0 -0
xpk/commands/kind.py
CHANGED
|
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from ..core.kueue_manager import (KueueConfig, KueueManager)
|
|
17
18
|
from ..core.commands import (
|
|
18
19
|
run_command_for_value,
|
|
19
20
|
run_command_with_updates,
|
|
@@ -24,17 +25,14 @@ from ..core.kjob import (
|
|
|
24
25
|
prepare_kjob,
|
|
25
26
|
apply_kjob_crds,
|
|
26
27
|
)
|
|
27
|
-
from ..core.
|
|
28
|
-
install_kueue_on_cluster,
|
|
29
|
-
install_kueue_crs,
|
|
30
|
-
wait_for_kueue_available,
|
|
31
|
-
)
|
|
28
|
+
from ..core.scheduling import get_total_chips_requested_from_args
|
|
32
29
|
from ..core.storage import install_storage_crd
|
|
33
30
|
from ..core.system_characteristics import (
|
|
34
31
|
SystemCharacteristics,
|
|
35
32
|
AcceleratorType,
|
|
36
33
|
)
|
|
37
34
|
from ..utils.console import (xpk_exit, xpk_print)
|
|
35
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
38
36
|
|
|
39
37
|
|
|
40
38
|
def cluster_create(args) -> None:
|
|
@@ -46,6 +44,12 @@ def cluster_create(args) -> None:
|
|
|
46
44
|
Returns:
|
|
47
45
|
0 if successful and 1 otherwise.
|
|
48
46
|
"""
|
|
47
|
+
if should_validate_dependencies(args):
|
|
48
|
+
validate_dependencies_list([
|
|
49
|
+
SystemDependency.KUBECTL,
|
|
50
|
+
SystemDependency.KJOB,
|
|
51
|
+
SystemDependency.GCLOUD,
|
|
52
|
+
])
|
|
49
53
|
xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
|
|
50
54
|
|
|
51
55
|
create_cluster_command_code = create_cluster_if_necessary(args)
|
|
@@ -64,18 +68,13 @@ def cluster_create(args) -> None:
|
|
|
64
68
|
if set_jobset_on_cluster_code != 0:
|
|
65
69
|
xpk_exit(set_jobset_on_cluster_code)
|
|
66
70
|
|
|
67
|
-
xpk_print('Enabling Kueue on the cluster')
|
|
68
|
-
install_kueue_on_cluster_code = install_kueue_on_cluster(args)
|
|
69
|
-
if install_kueue_on_cluster_code != 0:
|
|
70
|
-
xpk_exit(install_kueue_on_cluster_code)
|
|
71
|
-
|
|
72
71
|
xpk_print('Verifying kjob installation')
|
|
73
|
-
err_code = verify_kjob_installed(
|
|
72
|
+
err_code = verify_kjob_installed()
|
|
74
73
|
if err_code > 0:
|
|
75
74
|
xpk_exit(err_code)
|
|
76
75
|
|
|
77
76
|
xpk_print('Applying kjob CDRs')
|
|
78
|
-
err_code = apply_kjob_crds(
|
|
77
|
+
err_code = apply_kjob_crds()
|
|
79
78
|
if err_code > 0:
|
|
80
79
|
xpk_exit(err_code)
|
|
81
80
|
|
|
@@ -87,11 +86,6 @@ def cluster_create(args) -> None:
|
|
|
87
86
|
k8s_client = setup_k8s_env(args)
|
|
88
87
|
install_storage_crd(k8s_client)
|
|
89
88
|
|
|
90
|
-
xpk_print('Wait for Kueue to be fully available')
|
|
91
|
-
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
92
|
-
if wait_for_kueue_available_code != 0:
|
|
93
|
-
xpk_exit(wait_for_kueue_available_code)
|
|
94
|
-
|
|
95
89
|
args.num_slices = 1
|
|
96
90
|
args.enable_pathways = False
|
|
97
91
|
system = SystemCharacteristics(
|
|
@@ -102,12 +96,22 @@ def cluster_create(args) -> None:
|
|
|
102
96
|
1,
|
|
103
97
|
AcceleratorType['CPU'],
|
|
104
98
|
'kind',
|
|
99
|
+
supports_sub_slicing=False,
|
|
105
100
|
)
|
|
106
101
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
102
|
+
kueue_manager = KueueManager()
|
|
103
|
+
kueue_manager.install_or_upgrade(
|
|
104
|
+
KueueConfig(
|
|
105
|
+
system,
|
|
106
|
+
total_chips=get_total_chips_requested_from_args(args, system),
|
|
107
|
+
autoprovisioning_enabled=False,
|
|
108
|
+
num_slices=args.num_slices,
|
|
109
|
+
memory_limit='',
|
|
110
|
+
cpu_limit=0,
|
|
111
|
+
is_pathways_cluster=False,
|
|
112
|
+
flex=False,
|
|
113
|
+
),
|
|
114
|
+
)
|
|
111
115
|
|
|
112
116
|
xpk_print('Kind commands done! Resources are created.')
|
|
113
117
|
xpk_exit(0)
|
|
@@ -122,6 +126,8 @@ def cluster_delete(args) -> None:
|
|
|
122
126
|
Returns:
|
|
123
127
|
0 if successful and 1 otherwise.
|
|
124
128
|
"""
|
|
129
|
+
if should_validate_dependencies(args):
|
|
130
|
+
validate_dependencies_list([SystemDependency.GCLOUD])
|
|
125
131
|
xpk_print(f'Starting cluster delete for cluster: {args.cluster}', flush=True)
|
|
126
132
|
|
|
127
133
|
run_kind_cluster_delete_command_code = run_kind_cluster_delete_command(args)
|
|
@@ -134,13 +140,12 @@ def cluster_delete(args) -> None:
|
|
|
134
140
|
def cluster_list(args) -> None:
|
|
135
141
|
"""Function around cluster list.
|
|
136
142
|
|
|
137
|
-
Args:
|
|
138
|
-
args: user provided arguments for running the command.
|
|
139
|
-
|
|
140
143
|
Returns:
|
|
141
144
|
0 if successful and 1 otherwise.
|
|
142
145
|
"""
|
|
143
|
-
if
|
|
146
|
+
if should_validate_dependencies(args):
|
|
147
|
+
validate_dependencies_list([SystemDependency.GCLOUD])
|
|
148
|
+
if run_kind_clusters_list_command():
|
|
144
149
|
xpk_exit(1)
|
|
145
150
|
xpk_exit(0)
|
|
146
151
|
|
|
@@ -154,7 +159,7 @@ def create_cluster_if_necessary(args) -> int:
|
|
|
154
159
|
Returns:
|
|
155
160
|
0 if successful and 1 otherwise.
|
|
156
161
|
"""
|
|
157
|
-
all_clusters, return_code = get_all_local_clusters_programmatic(
|
|
162
|
+
all_clusters, return_code = get_all_local_clusters_programmatic()
|
|
158
163
|
if return_code > 0:
|
|
159
164
|
xpk_print('Listing all clusters failed!')
|
|
160
165
|
return 1
|
|
@@ -179,7 +184,7 @@ def run_kind_cluster_delete_command(args) -> int:
|
|
|
179
184
|
if args.cluster:
|
|
180
185
|
command += f' --name={args.cluster}'
|
|
181
186
|
|
|
182
|
-
return_code = run_command_with_updates(command, 'Cluster Delete'
|
|
187
|
+
return_code = run_command_with_updates(command, 'Cluster Delete')
|
|
183
188
|
if return_code != 0:
|
|
184
189
|
xpk_print(f'Cluster delete request returned ERROR {return_code}')
|
|
185
190
|
return 1
|
|
@@ -187,17 +192,14 @@ def run_kind_cluster_delete_command(args) -> int:
|
|
|
187
192
|
return 0
|
|
188
193
|
|
|
189
194
|
|
|
190
|
-
def run_kind_clusters_list_command(
|
|
195
|
+
def run_kind_clusters_list_command() -> int:
|
|
191
196
|
"""List Kind Clusters within the project and location.
|
|
192
197
|
|
|
193
|
-
Args:
|
|
194
|
-
args: user provided arguments for running the command.
|
|
195
|
-
|
|
196
198
|
Returns:
|
|
197
199
|
0 if successful and 1 otherwise.
|
|
198
200
|
"""
|
|
199
201
|
command = 'kind get clusters'
|
|
200
|
-
return_code = run_command_with_updates(command, 'Cluster List'
|
|
202
|
+
return_code = run_command_with_updates(command, 'Cluster List')
|
|
201
203
|
if return_code != 0:
|
|
202
204
|
xpk_print(f'Cluster list request returned ERROR {return_code}')
|
|
203
205
|
return 1
|
|
@@ -222,25 +224,22 @@ def run_kind_cluster_create_command(args) -> int:
|
|
|
222
224
|
if args.k8s_version:
|
|
223
225
|
command += f' --image=kindest/node:v{args.k8s_version}'
|
|
224
226
|
|
|
225
|
-
return_code = run_command_with_updates(command, 'Kind Cluster Create'
|
|
227
|
+
return_code = run_command_with_updates(command, 'Kind Cluster Create')
|
|
226
228
|
if return_code != 0:
|
|
227
229
|
xpk_print(f'GKE Cluster Create request returned ERROR {return_code}')
|
|
228
230
|
return 1
|
|
229
231
|
return 0
|
|
230
232
|
|
|
231
233
|
|
|
232
|
-
def get_all_local_clusters_programmatic(
|
|
234
|
+
def get_all_local_clusters_programmatic() -> tuple[list[str], int]:
|
|
233
235
|
"""Gets all the local clusters.
|
|
234
236
|
|
|
235
|
-
Args:
|
|
236
|
-
args: user provided arguments for running the command.
|
|
237
|
-
|
|
238
237
|
Returns:
|
|
239
238
|
List of cluster names and 0 if successful and 1 otherwise.
|
|
240
239
|
"""
|
|
241
240
|
command = 'kind get clusters'
|
|
242
241
|
return_code, raw_cluster_output = run_command_for_value(
|
|
243
|
-
command, 'Find if Cluster Exists'
|
|
242
|
+
command, 'Find if Cluster Exists'
|
|
244
243
|
)
|
|
245
244
|
if return_code != 0:
|
|
246
245
|
xpk_print(f'Find if Cluster Exists returned ERROR {return_code}')
|
|
@@ -261,7 +260,7 @@ def set_local_cluster_command(args) -> int:
|
|
|
261
260
|
if not args.cluster:
|
|
262
261
|
command = 'kubectl config current-context'
|
|
263
262
|
return_code, current_context = run_command_for_value(
|
|
264
|
-
command, 'get current-context'
|
|
263
|
+
command, 'get current-context'
|
|
265
264
|
)
|
|
266
265
|
xpk_print(
|
|
267
266
|
'No local cluster name specified. Using current-context'
|
|
@@ -276,7 +275,6 @@ def set_local_cluster_command(args) -> int:
|
|
|
276
275
|
return_code = run_command_with_updates(
|
|
277
276
|
command,
|
|
278
277
|
task,
|
|
279
|
-
args,
|
|
280
278
|
)
|
|
281
279
|
if return_code != 0:
|
|
282
280
|
xpk_print(f'{task} returned ERROR {return_code}')
|
xpk/commands/kjob_common.py
CHANGED
|
@@ -35,11 +35,11 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
35
35
|
|
|
36
36
|
annotations: tuple
|
|
37
37
|
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
38
|
-
annotations = get_a3mega_pod_template_annotations(
|
|
38
|
+
annotations = get_a3mega_pod_template_annotations()
|
|
39
39
|
elif gpu_type == H200_DEVICE_TYPE:
|
|
40
|
-
annotations = get_a3ultra_pod_template_annotations(
|
|
40
|
+
annotations = get_a3ultra_pod_template_annotations()
|
|
41
41
|
elif gpu_type == B200_DEVICE_TYPE:
|
|
42
|
-
annotations = get_a4_pod_template_annotations(
|
|
42
|
+
annotations = get_a4_pod_template_annotations()
|
|
43
43
|
else:
|
|
44
44
|
annotations = tuple()
|
|
45
45
|
|
|
@@ -54,7 +54,7 @@ def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
|
54
54
|
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
55
55
|
system_characteristics = get_cluster_system_characteristics(args)
|
|
56
56
|
capacity_type = get_cluster_capacity_type(args)
|
|
57
|
-
if is_TAS_possible(system_characteristics, capacity_type
|
|
57
|
+
if is_TAS_possible(system_characteristics, capacity_type):
|
|
58
58
|
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
59
59
|
|
|
60
60
|
return cmd
|
xpk/commands/run.py
CHANGED
|
@@ -28,8 +28,9 @@ from ..core.kjob import (
|
|
|
28
28
|
get_storage_annotations,
|
|
29
29
|
prepare_kjob,
|
|
30
30
|
)
|
|
31
|
-
from ..core.
|
|
31
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
32
32
|
from ..utils.console import xpk_exit, xpk_print
|
|
33
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
33
34
|
from .kind import set_local_cluster_command
|
|
34
35
|
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
35
36
|
|
|
@@ -42,6 +43,12 @@ def run(args: Namespace) -> None:
|
|
|
42
43
|
Returns:
|
|
43
44
|
None
|
|
44
45
|
"""
|
|
46
|
+
if should_validate_dependencies(args):
|
|
47
|
+
validate_dependencies_list([
|
|
48
|
+
SystemDependency.KUBECTL,
|
|
49
|
+
SystemDependency.KJOB,
|
|
50
|
+
SystemDependency.GCLOUD,
|
|
51
|
+
])
|
|
45
52
|
if not args.kind_cluster:
|
|
46
53
|
add_zone_and_project(args)
|
|
47
54
|
get_cluster_credentials(args)
|
|
@@ -126,7 +133,7 @@ def submit_job(args: Namespace) -> None:
|
|
|
126
133
|
if args.time is not None:
|
|
127
134
|
cmd += f' --time {args.time}'
|
|
128
135
|
|
|
129
|
-
return_code = run_command_with_full_controls(cmd, 'run task'
|
|
136
|
+
return_code = run_command_with_full_controls(cmd, 'run task')
|
|
130
137
|
|
|
131
138
|
if return_code != 0:
|
|
132
139
|
xpk_print(f'Running task returned ERROR {return_code}')
|
xpk/commands/shell.py
CHANGED
|
@@ -14,6 +14,7 @@ limitations under the License.
|
|
|
14
14
|
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
|
|
15
15
|
from ..core.cluster import get_cluster_credentials, add_zone_and_project, setup_k8s_service_accounts
|
|
16
16
|
from ..utils.console import xpk_exit, xpk_print
|
|
17
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
17
18
|
from argparse import Namespace
|
|
18
19
|
|
|
19
20
|
from ..core.kjob import (
|
|
@@ -33,14 +34,18 @@ def shell(args: Namespace):
|
|
|
33
34
|
Returns:
|
|
34
35
|
0 if successful and 1 otherwise.
|
|
35
36
|
"""
|
|
37
|
+
if should_validate_dependencies(args):
|
|
38
|
+
validate_dependencies_list([
|
|
39
|
+
SystemDependency.KUBECTL,
|
|
40
|
+
SystemDependency.KJOB,
|
|
41
|
+
SystemDependency.GCLOUD,
|
|
42
|
+
])
|
|
36
43
|
exisitng_shell_pod_name = get_existing_shell_pod_name(args)
|
|
37
44
|
|
|
38
45
|
if exisitng_shell_pod_name is None:
|
|
39
46
|
return_code = connect_to_new_interactive_shell(args)
|
|
40
47
|
else:
|
|
41
|
-
return_code = connect_to_existing_interactive_shell(
|
|
42
|
-
exisitng_shell_pod_name, args
|
|
43
|
-
)
|
|
48
|
+
return_code = connect_to_existing_interactive_shell(exisitng_shell_pod_name)
|
|
44
49
|
|
|
45
50
|
if return_code != 0:
|
|
46
51
|
xpk_print(f'The command failed with code {return_code}.')
|
|
@@ -60,7 +65,6 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None:
|
|
|
60
65
|
' -o custom-columns=":metadata.name"'
|
|
61
66
|
),
|
|
62
67
|
task='Get existing interactive shell pod name.',
|
|
63
|
-
global_args=args,
|
|
64
68
|
)
|
|
65
69
|
if return_code != 0:
|
|
66
70
|
xpk_print(
|
|
@@ -95,21 +99,17 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
95
99
|
return run_command_with_full_controls(
|
|
96
100
|
command=cmd,
|
|
97
101
|
task='Creating new interactive shell and entering it',
|
|
98
|
-
global_args=args,
|
|
99
102
|
instructions=exit_instructions,
|
|
100
103
|
)
|
|
101
104
|
|
|
102
105
|
|
|
103
|
-
def connect_to_existing_interactive_shell(
|
|
104
|
-
pod_name: str, args: Namespace
|
|
105
|
-
) -> int:
|
|
106
|
+
def connect_to_existing_interactive_shell(pod_name: str) -> int:
|
|
106
107
|
return run_command_with_full_controls(
|
|
107
108
|
command=(
|
|
108
109
|
f'kubectl exec --stdin --tty {pod_name} --'
|
|
109
110
|
f' {get_pod_template_interactive_command()}'
|
|
110
111
|
),
|
|
111
112
|
task='Entering existing interactive shell',
|
|
112
|
-
global_args=args,
|
|
113
113
|
instructions=exit_instructions,
|
|
114
114
|
)
|
|
115
115
|
|
|
@@ -121,6 +121,10 @@ def shell_stop(args: Namespace):
|
|
|
121
121
|
Returns:
|
|
122
122
|
0 if successful and 1 otherwise.
|
|
123
123
|
"""
|
|
124
|
+
if should_validate_dependencies(args):
|
|
125
|
+
validate_dependencies_list(
|
|
126
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
127
|
+
)
|
|
124
128
|
exisitng_shell_pod_name = get_existing_shell_pod_name(args)
|
|
125
129
|
|
|
126
130
|
if exisitng_shell_pod_name is None:
|
|
@@ -130,7 +134,6 @@ def shell_stop(args: Namespace):
|
|
|
130
134
|
return_code = run_command_with_updates(
|
|
131
135
|
command=f'kubectl delete pod {exisitng_shell_pod_name}',
|
|
132
136
|
task='Deleting the existing shell.',
|
|
133
|
-
global_args=args,
|
|
134
137
|
)
|
|
135
138
|
if return_code != 0:
|
|
136
139
|
xpk_exit(return_code)
|
xpk/commands/storage.py
CHANGED
|
@@ -59,9 +59,14 @@ from ..core.storage import (
|
|
|
59
59
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
60
60
|
from ..utils.kubectl import apply_kubectl_manifest
|
|
61
61
|
from ..utils.execution_context import is_dry_run
|
|
62
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
62
63
|
|
|
63
64
|
|
|
64
65
|
def storage_create(args: Namespace) -> None:
|
|
66
|
+
if should_validate_dependencies(args):
|
|
67
|
+
validate_dependencies_list(
|
|
68
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
69
|
+
)
|
|
65
70
|
add_zone_and_project(args)
|
|
66
71
|
if args.type == GCP_FILESTORE_TYPE:
|
|
67
72
|
if args.instance is None:
|
|
@@ -107,6 +112,10 @@ def storage_create(args: Namespace) -> None:
|
|
|
107
112
|
|
|
108
113
|
|
|
109
114
|
def storage_delete(args: Namespace) -> None:
|
|
115
|
+
if should_validate_dependencies(args):
|
|
116
|
+
validate_dependencies_list(
|
|
117
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
118
|
+
)
|
|
110
119
|
add_zone_and_project(args)
|
|
111
120
|
k8s_api_client = setup_k8s_env(args)
|
|
112
121
|
storages = list_storages(k8s_api_client)
|
|
@@ -141,6 +150,10 @@ def storage_delete(args: Namespace) -> None:
|
|
|
141
150
|
|
|
142
151
|
|
|
143
152
|
def storage_attach(args: Namespace) -> None:
|
|
153
|
+
if should_validate_dependencies(args):
|
|
154
|
+
validate_dependencies_list(
|
|
155
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
156
|
+
)
|
|
144
157
|
add_zone_and_project(args)
|
|
145
158
|
manifest: list[dict] = [{}]
|
|
146
159
|
if args.type == GCP_FILESTORE_TYPE:
|
|
@@ -244,6 +257,10 @@ def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
|
244
257
|
|
|
245
258
|
|
|
246
259
|
def storage_list(args: Namespace) -> None:
|
|
260
|
+
if should_validate_dependencies(args):
|
|
261
|
+
validate_dependencies_list(
|
|
262
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
263
|
+
)
|
|
247
264
|
storages = []
|
|
248
265
|
if not is_dry_run():
|
|
249
266
|
k8s_api_client = setup_k8s_env(args)
|
|
@@ -252,6 +269,10 @@ def storage_list(args: Namespace) -> None:
|
|
|
252
269
|
|
|
253
270
|
|
|
254
271
|
def storage_detach(args: Namespace) -> None:
|
|
272
|
+
if should_validate_dependencies(args):
|
|
273
|
+
validate_dependencies_list(
|
|
274
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
275
|
+
)
|
|
255
276
|
k8s_api_client = setup_k8s_env(args)
|
|
256
277
|
storage = get_storage(k8s_api_client, args.name)
|
|
257
278
|
delete_storage_resources(k8s_api_client, storage)
|
xpk/commands/version.py
CHANGED
|
@@ -18,10 +18,6 @@ from ..core.config import __version__
|
|
|
18
18
|
from ..utils.console import xpk_print
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def get_xpk_version() -> str:
|
|
22
|
-
return __version__
|
|
23
|
-
|
|
24
|
-
|
|
25
21
|
def version(args) -> None: # pylint: disable=unused-argument
|
|
26
22
|
"""Get version of xpk."""
|
|
27
23
|
xpk_print('xpk_version:', __version__)
|
xpk/commands/workload.py
CHANGED
|
@@ -34,7 +34,7 @@ from ..core.docker_container import (
|
|
|
34
34
|
)
|
|
35
35
|
from ..core.docker_resources import get_volumes, parse_env_config
|
|
36
36
|
from ..core.gcloud_context import add_zone_and_project
|
|
37
|
-
from ..core.
|
|
37
|
+
from ..core.kueue_manager import LOCAL_QUEUE_NAME
|
|
38
38
|
from ..core.monitoring import get_gke_outlier_dashboard
|
|
39
39
|
from ..core.nap import (
|
|
40
40
|
get_autoprovisioning_node_selector_args,
|
|
@@ -53,9 +53,6 @@ from ..core.pathways import (
|
|
|
53
53
|
try_to_delete_pathwaysjob_first,
|
|
54
54
|
)
|
|
55
55
|
from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
|
|
56
|
-
from ..core.capacity import (
|
|
57
|
-
CapacityType,
|
|
58
|
-
)
|
|
59
56
|
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
|
|
60
57
|
from ..core.scheduling import (
|
|
61
58
|
check_if_workload_can_schedule,
|
|
@@ -65,6 +62,7 @@ from ..core.scheduling import (
|
|
|
65
62
|
create_tpu_topology,
|
|
66
63
|
get_cpu_affinity,
|
|
67
64
|
get_gpu_scheduler,
|
|
65
|
+
create_sub_slicing_annotations,
|
|
68
66
|
)
|
|
69
67
|
from ..core.storage import (
|
|
70
68
|
GCE_PD_TYPE,
|
|
@@ -87,7 +85,7 @@ from ..core.workload import (
|
|
|
87
85
|
get_jobsets_list_gcp_link,
|
|
88
86
|
get_workload_list,
|
|
89
87
|
wait_for_job_completion,
|
|
90
|
-
|
|
88
|
+
get_cluster_location,
|
|
91
89
|
)
|
|
92
90
|
from ..core.workload_decorators import (
|
|
93
91
|
rdma_decorator,
|
|
@@ -98,8 +96,10 @@ from ..core.workload_decorators import (
|
|
|
98
96
|
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
99
97
|
from ..utils.file import write_tmp_file
|
|
100
98
|
from ..utils.execution_context import is_dry_run
|
|
99
|
+
from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
|
|
101
100
|
from . import cluster_gcluster
|
|
102
101
|
from .common import is_TAS_possible
|
|
102
|
+
from ..utils.feature_flags import FeatureFlags
|
|
103
103
|
|
|
104
104
|
WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
|
|
105
105
|
kind: JobSet
|
|
@@ -130,6 +130,7 @@ spec:
|
|
|
130
130
|
xpk.google.com/workload: {args.workload}
|
|
131
131
|
annotations:
|
|
132
132
|
{storage_annotations}
|
|
133
|
+
{sub_slicing_annotations}
|
|
133
134
|
spec:
|
|
134
135
|
schedulerName: {args.scheduler}
|
|
135
136
|
imagePullSecrets:
|
|
@@ -267,6 +268,8 @@ PW_WORKLOAD_CREATE_YAML = """
|
|
|
267
268
|
maxSliceRestarts: {args.max_slice_restarts}
|
|
268
269
|
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
|
|
269
270
|
priorityClassName: {args.priority}
|
|
271
|
+
nodeSelector:
|
|
272
|
+
{autoprovisioning_args}
|
|
270
273
|
pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
|
|
271
274
|
controller:
|
|
272
275
|
# #Pod template for training, default mode.
|
|
@@ -307,6 +310,12 @@ def workload_create(args) -> None:
|
|
|
307
310
|
Returns:
|
|
308
311
|
0 if successful and 1 otherwise.
|
|
309
312
|
"""
|
|
313
|
+
if should_validate_dependencies(args):
|
|
314
|
+
validate_dependencies_list([
|
|
315
|
+
SystemDependency.KUBECTL,
|
|
316
|
+
SystemDependency.GCLOUD,
|
|
317
|
+
SystemDependency.DOCKER,
|
|
318
|
+
])
|
|
310
319
|
k8s_api_client = None
|
|
311
320
|
if not is_dry_run():
|
|
312
321
|
k8s_api_client = setup_k8s_env(args)
|
|
@@ -334,7 +343,7 @@ def workload_create(args) -> None:
|
|
|
334
343
|
xpk_print('Starting workload create', flush=True)
|
|
335
344
|
|
|
336
345
|
metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
|
|
337
|
-
cluster_config_map = get_cluster_configmap(
|
|
346
|
+
cluster_config_map = get_cluster_configmap(metadata_configmap_name)
|
|
338
347
|
cluster_xpk_version = None
|
|
339
348
|
if cluster_config_map is None:
|
|
340
349
|
xpk_print(
|
|
@@ -482,16 +491,12 @@ def workload_create(args) -> None:
|
|
|
482
491
|
capacity_type = get_cluster_capacity_type(args)
|
|
483
492
|
|
|
484
493
|
annotations = (
|
|
485
|
-
|
|
486
|
-
if not is_TAS_possible(
|
|
487
|
-
system_characteristics,
|
|
488
|
-
capacity_type,
|
|
489
|
-
flex=True if capacity_type == CapacityType.FLEX_START else False,
|
|
490
|
-
)
|
|
491
|
-
else (
|
|
494
|
+
(
|
|
492
495
|
'kueue.x-k8s.io/podset-preferred-topology:'
|
|
493
496
|
' "cloud.google.com/gce-topology-host"'
|
|
494
497
|
)
|
|
498
|
+
if is_TAS_possible(system_characteristics, capacity_type)
|
|
499
|
+
else ''
|
|
495
500
|
)
|
|
496
501
|
|
|
497
502
|
if (
|
|
@@ -507,7 +512,7 @@ def workload_create(args) -> None:
|
|
|
507
512
|
annotations=annotations,
|
|
508
513
|
)
|
|
509
514
|
|
|
510
|
-
sub_networks = get_cluster_subnetworks(
|
|
515
|
+
sub_networks = get_cluster_subnetworks()
|
|
511
516
|
if args.device_type == a3high_device_type:
|
|
512
517
|
yml_string = tcpx_decorator.decorate_jobset(yml_string)
|
|
513
518
|
elif args.device_type == a3mega_device_type:
|
|
@@ -545,6 +550,7 @@ def workload_create(args) -> None:
|
|
|
545
550
|
colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
|
|
546
551
|
user_workload=get_user_workload_for_pathways(args, system),
|
|
547
552
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
553
|
+
autoprovisioning_args=autoprovisioning_args,
|
|
548
554
|
)
|
|
549
555
|
else:
|
|
550
556
|
container, debugging_dashboard_id = get_user_workload_container(
|
|
@@ -558,6 +564,14 @@ def workload_create(args) -> None:
|
|
|
558
564
|
accelerator_label=create_accelerator_label(
|
|
559
565
|
system.accelerator_type, system
|
|
560
566
|
),
|
|
567
|
+
sub_slicing_annotations=(
|
|
568
|
+
''
|
|
569
|
+
if not FeatureFlags.SUB_SLICING_ENABLED
|
|
570
|
+
or args.sub_slicing_topology is None
|
|
571
|
+
else ('\n' + (' ' * 16)).join(
|
|
572
|
+
create_sub_slicing_annotations(args.sub_slicing_topology)
|
|
573
|
+
)
|
|
574
|
+
),
|
|
561
575
|
machine_label=create_machine_label(system.accelerator_type, system),
|
|
562
576
|
local_queue_name=LOCAL_QUEUE_NAME,
|
|
563
577
|
autoprovisioning_args=autoprovisioning_args,
|
|
@@ -575,7 +589,7 @@ def workload_create(args) -> None:
|
|
|
575
589
|
)
|
|
576
590
|
tmp = write_tmp_file(yml_string)
|
|
577
591
|
command = f'kubectl apply -f {str(tmp)}'
|
|
578
|
-
return_code = run_command_with_updates(command, 'Creating Workload'
|
|
592
|
+
return_code = run_command_with_updates(command, 'Creating Workload')
|
|
579
593
|
|
|
580
594
|
if return_code != 0:
|
|
581
595
|
xpk_print(f'Create Workload request returned ERROR {return_code}')
|
|
@@ -622,7 +636,9 @@ def workload_create(args) -> None:
|
|
|
622
636
|
' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
|
|
623
637
|
" python -c 'import pathwaysutils; import jax; print(jax.devices())'"
|
|
624
638
|
)
|
|
625
|
-
pathways_proxy_link =
|
|
639
|
+
pathways_proxy_link = (
|
|
640
|
+
f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
|
|
641
|
+
)
|
|
626
642
|
xpk_print(
|
|
627
643
|
'Follow the proxy here:'
|
|
628
644
|
# pylint: disable=line-too-long)
|
|
@@ -636,7 +652,7 @@ def workload_create(args) -> None:
|
|
|
636
652
|
xpk_print(
|
|
637
653
|
'Follow your workload here:'
|
|
638
654
|
# pylint: disable=line-too-long
|
|
639
|
-
f' https://console.cloud.google.com/kubernetes/service/{
|
|
655
|
+
f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
|
|
640
656
|
)
|
|
641
657
|
duration_of_logs = 'P1D' # Past 1 Day
|
|
642
658
|
xpk_print(
|
|
@@ -645,7 +661,7 @@ def workload_create(args) -> None:
|
|
|
645
661
|
' ([prefix]-slice-job-[slice_number]-[worker_number])'
|
|
646
662
|
' after clicking the url if you want other worker logs.'
|
|
647
663
|
# pylint: disable=line-too-long
|
|
648
|
-
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{
|
|
664
|
+
f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
|
|
649
665
|
)
|
|
650
666
|
|
|
651
667
|
xpk_exit(0)
|
|
@@ -678,6 +694,10 @@ def workload_delete(args) -> None:
|
|
|
678
694
|
Returns:
|
|
679
695
|
0 if successful and 1 otherwise.
|
|
680
696
|
"""
|
|
697
|
+
if should_validate_dependencies(args):
|
|
698
|
+
validate_dependencies_list(
|
|
699
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
700
|
+
)
|
|
681
701
|
xpk_print('Starting Workload delete', flush=True)
|
|
682
702
|
add_zone_and_project(args)
|
|
683
703
|
get_cluster_credentials(args)
|
|
@@ -725,16 +745,13 @@ def workload_delete(args) -> None:
|
|
|
725
745
|
|
|
726
746
|
# Not batching deletion for single workload
|
|
727
747
|
if len(workloads) == 1:
|
|
728
|
-
return_code = run_command_with_updates(
|
|
729
|
-
commands[0], 'Delete Workload', args
|
|
730
|
-
)
|
|
748
|
+
return_code = run_command_with_updates(commands[0], 'Delete Workload')
|
|
731
749
|
else:
|
|
732
750
|
return_code = run_commands(
|
|
733
751
|
commands,
|
|
734
752
|
'Delete Workload',
|
|
735
753
|
task_names,
|
|
736
754
|
batch=100,
|
|
737
|
-
dry_run=args.dry_run,
|
|
738
755
|
)
|
|
739
756
|
|
|
740
757
|
if return_code != 0:
|
|
@@ -752,6 +769,10 @@ def workload_list(args) -> None:
|
|
|
752
769
|
Returns:
|
|
753
770
|
0 if successful and 1 otherwise.
|
|
754
771
|
"""
|
|
772
|
+
if should_validate_dependencies(args):
|
|
773
|
+
validate_dependencies_list(
|
|
774
|
+
[SystemDependency.KUBECTL, SystemDependency.GCLOUD]
|
|
775
|
+
)
|
|
755
776
|
xpk_print('Starting workload list', flush=True)
|
|
756
777
|
add_zone_and_project(args)
|
|
757
778
|
get_cluster_credentials(args)
|