xpk 0.7.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/batch.py +19 -13
- xpk/commands/cluster.py +240 -71
- xpk/commands/cluster_gcluster.py +22 -5
- xpk/commands/common.py +33 -1
- xpk/commands/info.py +2 -4
- xpk/commands/job.py +7 -8
- xpk/commands/kjob_common.py +30 -18
- xpk/commands/run.py +17 -12
- xpk/commands/shell.py +3 -4
- xpk/commands/storage.py +75 -19
- xpk/commands/workload.py +161 -324
- xpk/core/blueprint/blueprint_definitions.py +2 -0
- xpk/core/blueprint/blueprint_generator.py +335 -45
- xpk/core/capacity.py +1 -0
- xpk/core/cluster.py +193 -12
- xpk/core/config.py +3 -1
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +9 -21
- xpk/core/filestore.py +5 -1
- xpk/core/gcsfuse.py +27 -6
- xpk/core/kjob.py +66 -20
- xpk/core/kueue.py +30 -0
- xpk/core/mtc.py +195 -0
- xpk/core/nap.py +4 -0
- xpk/core/network.py +34 -22
- xpk/core/nodepool.py +28 -26
- xpk/core/pathways.py +165 -210
- xpk/core/resources.py +21 -0
- xpk/core/scheduling.py +36 -0
- xpk/core/storage.py +66 -12
- xpk/core/system_characteristics.py +9 -0
- xpk/core/workload.py +28 -83
- xpk/core/workload_decorators/rdma_decorator.py +11 -15
- xpk/core/workload_decorators/storage_decorator.py +8 -3
- xpk/core/workload_decorators/tcpx_decorator.py +179 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +17 -16
- xpk/parser/cluster.py +574 -381
- xpk/parser/storage.py +25 -5
- xpk/parser/workload.py +59 -31
- xpk/utils/kubectl.py +4 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/METADATA +192 -93
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/RECORD +46 -44
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.7.2.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0
xpk/commands/kjob_common.py
CHANGED
|
@@ -14,31 +14,43 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..core.
|
|
18
|
-
|
|
17
|
+
from ..core.capacity import (
|
|
18
|
+
B200_DEVICE_TYPE,
|
|
19
|
+
H100_MEGA_DEVICE_TYPE,
|
|
20
|
+
H200_DEVICE_TYPE,
|
|
21
|
+
)
|
|
19
22
|
from ..core.cluster import get_gpu_type_from_cluster
|
|
23
|
+
from ..core.kjob import (
|
|
24
|
+
get_a3mega_pod_template_annotations,
|
|
25
|
+
get_a3ultra_pod_template_annotations,
|
|
26
|
+
get_a4_pod_template_annotations,
|
|
27
|
+
Kueue_TAS_annotation,
|
|
28
|
+
)
|
|
29
|
+
from .common import is_TAS_possible
|
|
20
30
|
|
|
21
31
|
|
|
22
|
-
def
|
|
23
|
-
|
|
24
|
-
cmd += f" --pod-template-annotation {tcpxo} \\\n"
|
|
25
|
-
cmd += f" --pod-template-annotation {eth0} \\\n"
|
|
26
|
-
cmd += f" --pod-template-annotation {interfaces} "
|
|
27
|
-
return cmd
|
|
32
|
+
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
33
|
+
gpu_type = get_gpu_type_from_cluster(args)
|
|
28
34
|
|
|
35
|
+
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
36
|
+
annotations = get_a3mega_pod_template_annotations(args)
|
|
37
|
+
elif gpu_type == H200_DEVICE_TYPE:
|
|
38
|
+
annotations = get_a3ultra_pod_template_annotations(args)
|
|
39
|
+
elif gpu_type == B200_DEVICE_TYPE:
|
|
40
|
+
annotations = get_a4_pod_template_annotations(args)
|
|
41
|
+
else:
|
|
42
|
+
annotations = []
|
|
43
|
+
|
|
44
|
+
flags = [
|
|
45
|
+
f" --pod-template-annotation {annotation} " for annotation in annotations
|
|
46
|
+
]
|
|
47
|
+
cmd += "\\\n".join(flags)
|
|
29
48
|
|
|
30
|
-
def add_rdma_annotations(args, cmd) -> str:
|
|
31
|
-
eth0, interfaces = get_a3ultra_pod_template_annotations(args)
|
|
32
|
-
cmd += f" --pod-template-annotation {eth0} \\\n"
|
|
33
|
-
cmd += f" --pod-template-annotation {interfaces} \\\n"
|
|
34
49
|
return cmd
|
|
35
50
|
|
|
36
51
|
|
|
37
|
-
def
|
|
38
|
-
|
|
52
|
+
def add_TAS_annotations_to_command(args, cmd: str) -> str:
|
|
53
|
+
if is_TAS_possible(args):
|
|
54
|
+
cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
|
|
39
55
|
|
|
40
|
-
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
41
|
-
return add_tcpxo_annotations(args, cmd)
|
|
42
|
-
if gpu_type == H200_DEVICE_TYPE:
|
|
43
|
-
return add_rdma_annotations(args, cmd)
|
|
44
56
|
return cmd
|
xpk/commands/run.py
CHANGED
|
@@ -16,15 +16,22 @@ limitations under the License.
|
|
|
16
16
|
|
|
17
17
|
from argparse import Namespace
|
|
18
18
|
|
|
19
|
-
from ..core.cluster import
|
|
19
|
+
from ..core.cluster import (
|
|
20
|
+
create_xpk_k8s_service_account,
|
|
21
|
+
get_cluster_credentials,
|
|
22
|
+
)
|
|
20
23
|
from ..core.commands import run_command_with_full_controls
|
|
21
24
|
from ..core.gcloud_context import add_zone_and_project
|
|
25
|
+
from ..core.kjob import (
|
|
26
|
+
AppProfileDefaults,
|
|
27
|
+
JobTemplateDefaults,
|
|
28
|
+
get_storage_annotations,
|
|
29
|
+
prepare_kjob,
|
|
30
|
+
)
|
|
22
31
|
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
23
32
|
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
-
from .common import set_cluster_command
|
|
25
|
-
from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
-
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
27
33
|
from .kind import set_local_cluster_command
|
|
34
|
+
from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
|
|
28
35
|
|
|
29
36
|
|
|
30
37
|
def run(args: Namespace) -> None:
|
|
@@ -37,12 +44,11 @@ def run(args: Namespace) -> None:
|
|
|
37
44
|
"""
|
|
38
45
|
if not args.kind_cluster:
|
|
39
46
|
add_zone_and_project(args)
|
|
40
|
-
|
|
47
|
+
get_cluster_credentials(args)
|
|
41
48
|
else:
|
|
42
49
|
set_cluster_command_code = set_local_cluster_command(args)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
xpk_exit(set_cluster_command_code)
|
|
50
|
+
if set_cluster_command_code != 0:
|
|
51
|
+
xpk_exit(set_cluster_command_code)
|
|
46
52
|
|
|
47
53
|
err_code = prepare_kjob(args)
|
|
48
54
|
if err_code > 0:
|
|
@@ -57,16 +63,15 @@ def submit_job(args: Namespace) -> None:
|
|
|
57
63
|
'kubectl kjob create slurm --profile'
|
|
58
64
|
f' {AppProfileDefaults.NAME.value} '
|
|
59
65
|
f' --localqueue {LOCAL_QUEUE_NAME} '
|
|
60
|
-
f" --pod-template-annotation '{Kueue_TAS_annotation}'"
|
|
61
66
|
f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
62
67
|
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
63
68
|
' --wait --rm --first-node-ip'
|
|
64
69
|
)
|
|
65
70
|
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
71
|
+
cmd = add_TAS_annotations_to_command(args, cmd)
|
|
66
72
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
73
|
+
for annotation in get_storage_annotations(args):
|
|
74
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
70
75
|
|
|
71
76
|
if args.timeout:
|
|
72
77
|
cmd += f' --wait-timeout {args.timeout}s'
|
xpk/commands/shell.py
CHANGED
|
@@ -20,7 +20,7 @@ from ..core.kjob import (
|
|
|
20
20
|
AppProfileDefaults,
|
|
21
21
|
prepare_kjob,
|
|
22
22
|
get_pod_template_interactive_command,
|
|
23
|
-
|
|
23
|
+
get_storage_annotations,
|
|
24
24
|
)
|
|
25
25
|
|
|
26
26
|
exit_instructions = 'To exit the shell input "exit".'
|
|
@@ -89,9 +89,8 @@ def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
|
89
89
|
f' {AppProfileDefaults.NAME.value} --pod-running-timeout 180s'
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
92
|
+
for annotation in get_storage_annotations(args):
|
|
93
|
+
cmd += f' --pod-template-annotation {annotation}'
|
|
95
94
|
|
|
96
95
|
return run_command_with_full_controls(
|
|
97
96
|
command=cmd,
|
xpk/commands/storage.py
CHANGED
|
@@ -27,6 +27,8 @@ from ..core.cluster import (
|
|
|
27
27
|
add_zone_and_project,
|
|
28
28
|
get_cluster_network,
|
|
29
29
|
setup_k8s_env,
|
|
30
|
+
update_cluster_with_parallelstore_driver_if_necessary,
|
|
31
|
+
update_cluster_with_pd_driver_if_necessary,
|
|
30
32
|
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
31
33
|
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
32
34
|
update_cluster_with_workload_identity_if_necessary,
|
|
@@ -41,6 +43,8 @@ from ..core.kjob import (
|
|
|
41
43
|
from ..core.storage import (
|
|
42
44
|
GCP_FILESTORE_TYPE,
|
|
43
45
|
GCS_FUSE_TYPE,
|
|
46
|
+
GCE_PD_TYPE,
|
|
47
|
+
PARALLELSTORE_TYPE,
|
|
44
48
|
STORAGE_CRD_PLURAL,
|
|
45
49
|
XPK_API_GROUP_NAME,
|
|
46
50
|
XPK_API_GROUP_VERSION,
|
|
@@ -78,7 +82,10 @@ def storage_create(args: Namespace) -> None:
|
|
|
78
82
|
manifest = list(yaml.safe_load_all(f))
|
|
79
83
|
else:
|
|
80
84
|
manifest = filestore_client.manifest(
|
|
81
|
-
args.name,
|
|
85
|
+
args.name,
|
|
86
|
+
args.vol,
|
|
87
|
+
args.access_mode,
|
|
88
|
+
filestore_network,
|
|
82
89
|
)
|
|
83
90
|
|
|
84
91
|
k8s_api_client = setup_k8s_env(args)
|
|
@@ -86,9 +93,10 @@ def storage_create(args: Namespace) -> None:
|
|
|
86
93
|
create_volume_bundle_instance(
|
|
87
94
|
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
88
95
|
)
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
96
|
+
# Not required for Filestore. Will be uncommented when adding GCSFuse create
|
|
97
|
+
# return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
98
|
+
# if return_code > 0:
|
|
99
|
+
# xpk_exit(return_code)
|
|
92
100
|
return_code = update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
93
101
|
if return_code > 0:
|
|
94
102
|
xpk_exit(return_code)
|
|
@@ -131,6 +139,7 @@ def storage_delete(args: Namespace) -> None:
|
|
|
131
139
|
|
|
132
140
|
def storage_attach(args: Namespace) -> None:
|
|
133
141
|
add_zone_and_project(args)
|
|
142
|
+
manifest = [{}]
|
|
134
143
|
if args.type == GCP_FILESTORE_TYPE:
|
|
135
144
|
if args.instance is None:
|
|
136
145
|
args.instance = args.name
|
|
@@ -148,10 +157,13 @@ def storage_attach(args: Namespace) -> None:
|
|
|
148
157
|
else:
|
|
149
158
|
filestore_network = get_cluster_network(args)
|
|
150
159
|
manifest = filestore_client.manifest(
|
|
151
|
-
args.name,
|
|
160
|
+
args.name,
|
|
161
|
+
args.vol,
|
|
162
|
+
args.access_mode,
|
|
163
|
+
filestore_network,
|
|
152
164
|
)
|
|
153
165
|
|
|
154
|
-
|
|
166
|
+
elif args.type == GCS_FUSE_TYPE:
|
|
155
167
|
if args.manifest is None and args.size is None:
|
|
156
168
|
xpk_print("--size is required when attaching gcsfuse storage.")
|
|
157
169
|
xpk_exit(1)
|
|
@@ -164,30 +176,65 @@ def storage_attach(args: Namespace) -> None:
|
|
|
164
176
|
manifest = list(yaml.safe_load_all(f))
|
|
165
177
|
else:
|
|
166
178
|
manifest = gcsfuse.manifest(
|
|
167
|
-
|
|
179
|
+
args.name,
|
|
180
|
+
args.bucket,
|
|
181
|
+
args.size,
|
|
182
|
+
args.mount_options,
|
|
183
|
+
args.prefetch_metadata,
|
|
168
184
|
)
|
|
169
185
|
|
|
186
|
+
elif args.type in [PARALLELSTORE_TYPE, GCE_PD_TYPE]:
|
|
187
|
+
if args.manifest is None:
|
|
188
|
+
xpk_print(
|
|
189
|
+
"Parallelstore and PersistentDisk are currently supported only with"
|
|
190
|
+
" --manifest"
|
|
191
|
+
)
|
|
192
|
+
xpk_exit(1)
|
|
193
|
+
|
|
194
|
+
with open(args.manifest, "r", encoding="utf-8") as f:
|
|
195
|
+
manifest = list(yaml.safe_load_all(f))
|
|
196
|
+
|
|
197
|
+
else:
|
|
198
|
+
xpk_print(f"Storage type {args.type} is not supported.")
|
|
199
|
+
xpk_exit(1)
|
|
200
|
+
|
|
170
201
|
k8s_api_client = setup_k8s_env(args)
|
|
171
202
|
create_storage_crds(k8s_api_client, args, manifest)
|
|
172
203
|
create_volume_bundle_instance(
|
|
173
204
|
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
174
205
|
)
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
xpk_exit(return_code)
|
|
178
|
-
|
|
179
|
-
# args.type can have only two values after parsing
|
|
180
|
-
return_code = (
|
|
181
|
-
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
182
|
-
if args.type == GCS_FUSE_TYPE
|
|
183
|
-
else update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
184
|
-
)
|
|
185
|
-
if return_code > 0:
|
|
186
|
-
xpk_exit(return_code)
|
|
206
|
+
|
|
207
|
+
enable_csi_drivers_if_necessary(args)
|
|
187
208
|
|
|
188
209
|
apply_kubectl_manifest(k8s_api_client, manifest)
|
|
189
210
|
|
|
190
211
|
|
|
212
|
+
def enable_csi_drivers_if_necessary(args: Namespace) -> None:
|
|
213
|
+
if args.type == GCS_FUSE_TYPE:
|
|
214
|
+
return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
215
|
+
if return_code > 0:
|
|
216
|
+
xpk_exit(return_code)
|
|
217
|
+
|
|
218
|
+
return_code = update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
219
|
+
if return_code > 0:
|
|
220
|
+
xpk_exit(return_code)
|
|
221
|
+
|
|
222
|
+
if args.type == GCP_FILESTORE_TYPE:
|
|
223
|
+
return_code = update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
224
|
+
if return_code > 0:
|
|
225
|
+
xpk_exit(return_code)
|
|
226
|
+
|
|
227
|
+
if args.type == PARALLELSTORE_TYPE:
|
|
228
|
+
return_code = update_cluster_with_parallelstore_driver_if_necessary(args)
|
|
229
|
+
if return_code > 0:
|
|
230
|
+
xpk_exit(return_code)
|
|
231
|
+
|
|
232
|
+
if args.type == GCE_PD_TYPE:
|
|
233
|
+
return_code = update_cluster_with_pd_driver_if_necessary(args)
|
|
234
|
+
if return_code > 0:
|
|
235
|
+
xpk_exit(return_code)
|
|
236
|
+
|
|
237
|
+
|
|
191
238
|
def storage_list(args: Namespace) -> None:
|
|
192
239
|
k8s_api_client = setup_k8s_env(args)
|
|
193
240
|
storages = list_storages(k8s_api_client)
|
|
@@ -278,3 +325,12 @@ def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
|
278
325
|
storage.name,
|
|
279
326
|
"Storage",
|
|
280
327
|
)
|
|
328
|
+
|
|
329
|
+
# remove kubernetes.io/pvc-protection
|
|
330
|
+
delete_resource(
|
|
331
|
+
lambda name: core_api.patch_namespaced_persistent_volume_claim(
|
|
332
|
+
name, "default", {"metadata": {"finalizers": None}}
|
|
333
|
+
),
|
|
334
|
+
storage.pvc,
|
|
335
|
+
"Persistent Volume Claim finalizers",
|
|
336
|
+
)
|