xpk 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/commands/info.py
CHANGED
|
@@ -14,19 +14,17 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
18
|
-
from ..core.kueue import verify_kueuectl
|
|
19
|
-
from .cluster import set_cluster_command
|
|
20
|
-
from ..core.commands import (
|
|
21
|
-
run_command_for_value,
|
|
22
|
-
)
|
|
23
|
-
from ..core.core import (
|
|
24
|
-
add_zone_and_project,
|
|
25
|
-
)
|
|
26
17
|
import json
|
|
27
|
-
from tabulate import tabulate
|
|
28
18
|
from argparse import Namespace
|
|
29
19
|
|
|
20
|
+
from tabulate import tabulate
|
|
21
|
+
|
|
22
|
+
from ..core.commands import run_command_for_value
|
|
23
|
+
from ..core.gcloud_context import add_zone_and_project
|
|
24
|
+
from ..core.kueue import verify_kueuectl
|
|
25
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
+
from .common import set_cluster_command
|
|
27
|
+
|
|
30
28
|
table_fmt = 'plain'
|
|
31
29
|
|
|
32
30
|
|
xpk/commands/inspector.py
CHANGED
|
@@ -14,17 +14,13 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from ..core.cluster import get_cluster_credentials
|
|
17
18
|
from ..core.commands import run_command_for_value
|
|
18
|
-
from ..core.
|
|
19
|
-
CLUSTER_METADATA_CONFIGMAP,
|
|
20
|
-
CLUSTER_RESOURCES_CONFIGMAP,
|
|
21
|
-
add_zone_and_project,
|
|
22
|
-
zone_to_region,
|
|
23
|
-
)
|
|
19
|
+
from ..core.gcloud_context import add_zone_and_project, zone_to_region
|
|
24
20
|
from ..core.kueue import CLUSTER_QUEUE_NAME, LOCAL_QUEUE_NAME
|
|
25
|
-
from ..
|
|
21
|
+
from ..core.resources import CLUSTER_METADATA_CONFIGMAP, CLUSTER_RESOURCES_CONFIGMAP
|
|
26
22
|
from ..utils.console import xpk_exit, xpk_print
|
|
27
|
-
from .
|
|
23
|
+
from ..utils.file import append_tmp_file, write_tmp_file
|
|
28
24
|
from .workload import get_workload_list
|
|
29
25
|
|
|
30
26
|
|
|
@@ -125,9 +121,7 @@ def inspector(args) -> None:
|
|
|
125
121
|
xpk_print(args)
|
|
126
122
|
|
|
127
123
|
add_zone_and_project(args)
|
|
128
|
-
|
|
129
|
-
if set_cluster_command_code != 0:
|
|
130
|
-
xpk_exit(set_cluster_command_code)
|
|
124
|
+
get_cluster_credentials(args)
|
|
131
125
|
|
|
132
126
|
inspector_file = write_tmp_file(
|
|
133
127
|
'==================\nXPK inspector OUTPUT:\n==================\n'
|
xpk/commands/job.py
CHANGED
|
@@ -14,16 +14,18 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from .cluster import set_cluster_command
|
|
18
|
-
from .kind import set_local_cluster_command
|
|
19
|
-
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
20
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
21
|
-
from ..core.kjob import AppProfileDefaults
|
|
22
|
-
from ..core.core import add_zone_and_project
|
|
23
|
-
from ruamel.yaml import YAML
|
|
24
17
|
import re
|
|
25
18
|
import sys
|
|
26
19
|
|
|
20
|
+
from ruamel.yaml import YAML
|
|
21
|
+
|
|
22
|
+
from ..core.commands import run_command_for_value, run_command_with_updates
|
|
23
|
+
from ..core.gcloud_context import add_zone_and_project
|
|
24
|
+
from ..core.kjob import AppProfileDefaults
|
|
25
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
26
|
+
from .common import set_cluster_command
|
|
27
|
+
from .kind import set_local_cluster_command
|
|
28
|
+
|
|
27
29
|
|
|
28
30
|
def job_info(args):
|
|
29
31
|
"""Run commands obtaining information about a job given by name.
|
xpk/commands/kind.py
CHANGED
|
@@ -18,9 +18,7 @@ from ..core.commands import (
|
|
|
18
18
|
run_command_for_value,
|
|
19
19
|
run_command_with_updates,
|
|
20
20
|
)
|
|
21
|
-
from ..core.
|
|
22
|
-
set_jobset_on_cluster,
|
|
23
|
-
)
|
|
21
|
+
from ..core.cluster import set_jobset_on_cluster, setup_k8s_env
|
|
24
22
|
from ..core.kjob import (
|
|
25
23
|
verify_kjob_installed,
|
|
26
24
|
prepare_kjob,
|
|
@@ -28,6 +26,13 @@ from ..core.kjob import (
|
|
|
28
26
|
)
|
|
29
27
|
from ..core.kueue import (
|
|
30
28
|
install_kueue_on_cluster,
|
|
29
|
+
install_kueue_crs,
|
|
30
|
+
wait_for_kueue_available,
|
|
31
|
+
)
|
|
32
|
+
from ..core.storage import install_storage_crd
|
|
33
|
+
from ..core.system_characteristics import (
|
|
34
|
+
SystemCharacteristics,
|
|
35
|
+
AcceleratorType,
|
|
31
36
|
)
|
|
32
37
|
from ..utils.console import (xpk_exit, xpk_print)
|
|
33
38
|
|
|
@@ -74,11 +79,36 @@ def cluster_create(args) -> None:
|
|
|
74
79
|
if err_code > 0:
|
|
75
80
|
xpk_exit(err_code)
|
|
76
81
|
|
|
77
|
-
|
|
82
|
+
args.kind_cluster = True
|
|
78
83
|
err_code = prepare_kjob(args)
|
|
79
84
|
if err_code > 0:
|
|
80
85
|
xpk_exit(err_code)
|
|
81
86
|
|
|
87
|
+
k8s_client = setup_k8s_env(args)
|
|
88
|
+
install_storage_crd(k8s_client)
|
|
89
|
+
|
|
90
|
+
xpk_print('Wait for Kueue to be fully available')
|
|
91
|
+
wait_for_kueue_available_code = wait_for_kueue_available(args)
|
|
92
|
+
if wait_for_kueue_available_code != 0:
|
|
93
|
+
xpk_exit(wait_for_kueue_available_code)
|
|
94
|
+
|
|
95
|
+
args.num_slices = 1
|
|
96
|
+
args.enable_pathways = False
|
|
97
|
+
system = SystemCharacteristics(
|
|
98
|
+
'N/A',
|
|
99
|
+
1,
|
|
100
|
+
'N/A',
|
|
101
|
+
'N/A',
|
|
102
|
+
1,
|
|
103
|
+
AcceleratorType['CPU'],
|
|
104
|
+
'kind',
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
xpk_print('Install Kueue Custom Resources')
|
|
108
|
+
enable_kueue_credentials_code = install_kueue_crs(args, system, None)
|
|
109
|
+
if enable_kueue_credentials_code != 0:
|
|
110
|
+
xpk_exit(enable_kueue_credentials_code)
|
|
111
|
+
|
|
82
112
|
xpk_print('Kind commands done! Resources are created.')
|
|
83
113
|
xpk_exit(0)
|
|
84
114
|
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..core.kjob import get_a3mega_pod_template_annotations, get_a3ultra_pod_template_annotations
|
|
18
|
+
from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
|
|
19
|
+
from ..core.cluster import get_gpu_type_from_cluster
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def add_tcpxo_annotations(args, cmd: str) -> str:
|
|
23
|
+
tcpxo, interfaces, eth0 = get_a3mega_pod_template_annotations(args)
|
|
24
|
+
cmd += f" --pod-template-annotation {tcpxo} \\\n"
|
|
25
|
+
cmd += f" --pod-template-annotation {eth0} \\\n"
|
|
26
|
+
cmd += f" --pod-template-annotation {interfaces} "
|
|
27
|
+
return cmd
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def add_rdma_annotations(args, cmd) -> str:
|
|
31
|
+
eth0, interfaces = get_a3ultra_pod_template_annotations(args)
|
|
32
|
+
cmd += f" --pod-template-annotation {eth0} \\\n"
|
|
33
|
+
cmd += f" --pod-template-annotation {interfaces} \\\n"
|
|
34
|
+
return cmd
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
|
|
38
|
+
gpu_type = get_gpu_type_from_cluster(args)
|
|
39
|
+
|
|
40
|
+
if gpu_type == H100_MEGA_DEVICE_TYPE:
|
|
41
|
+
return add_tcpxo_annotations(args, cmd)
|
|
42
|
+
if gpu_type == H200_DEVICE_TYPE:
|
|
43
|
+
return add_rdma_annotations(args, cmd)
|
|
44
|
+
return cmd
|
xpk/commands/run.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
|
|
19
|
+
from ..core.cluster import create_xpk_k8s_service_account
|
|
20
|
+
from ..core.commands import run_command_with_full_controls
|
|
21
|
+
from ..core.gcloud_context import add_zone_and_project
|
|
22
|
+
from ..core.kueue import LOCAL_QUEUE_NAME
|
|
23
|
+
from ..utils.console import xpk_exit, xpk_print
|
|
24
|
+
from .common import set_cluster_command
|
|
25
|
+
from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
|
|
26
|
+
from .kjob_common import add_gpu_networking_annotations_to_command
|
|
27
|
+
from .kind import set_local_cluster_command
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def run(args: Namespace) -> None:
|
|
31
|
+
"""Run task.
|
|
32
|
+
This function runs passed script in non-blocking manner.
|
|
33
|
+
Args:
|
|
34
|
+
args: user provided arguments for running the command.
|
|
35
|
+
Returns:
|
|
36
|
+
None
|
|
37
|
+
"""
|
|
38
|
+
if not args.kind_cluster:
|
|
39
|
+
add_zone_and_project(args)
|
|
40
|
+
set_cluster_command_code = set_cluster_command(args)
|
|
41
|
+
else:
|
|
42
|
+
set_cluster_command_code = set_local_cluster_command(args)
|
|
43
|
+
|
|
44
|
+
if set_cluster_command_code != 0:
|
|
45
|
+
xpk_exit(set_cluster_command_code)
|
|
46
|
+
|
|
47
|
+
err_code = prepare_kjob(args)
|
|
48
|
+
if err_code > 0:
|
|
49
|
+
xpk_exit(err_code)
|
|
50
|
+
create_xpk_k8s_service_account()
|
|
51
|
+
|
|
52
|
+
submit_job(args)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def submit_job(args: Namespace) -> None:
|
|
56
|
+
cmd = (
|
|
57
|
+
'kubectl kjob create slurm --profile'
|
|
58
|
+
f' {AppProfileDefaults.NAME.value} '
|
|
59
|
+
f' --localqueue {LOCAL_QUEUE_NAME} '
|
|
60
|
+
f" --pod-template-annotation '{Kueue_TAS_annotation}'"
|
|
61
|
+
f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
62
|
+
f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
|
|
63
|
+
' --wait --rm --first-node-ip'
|
|
64
|
+
)
|
|
65
|
+
cmd = add_gpu_networking_annotations_to_command(args, cmd)
|
|
66
|
+
|
|
67
|
+
gcsfuse_annotation = get_gcsfuse_annotation(args)
|
|
68
|
+
if gcsfuse_annotation is not None:
|
|
69
|
+
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
70
|
+
|
|
71
|
+
if args.timeout:
|
|
72
|
+
cmd += f' --wait-timeout {args.timeout}s'
|
|
73
|
+
|
|
74
|
+
if args.ignore_unknown_flags:
|
|
75
|
+
cmd += ' --ignore-unknown-flags'
|
|
76
|
+
|
|
77
|
+
cmd += f' -- {args.script} --partition {LOCAL_QUEUE_NAME}'
|
|
78
|
+
|
|
79
|
+
if args.array is not None:
|
|
80
|
+
cmd += f' --array {args.array}'
|
|
81
|
+
|
|
82
|
+
if args.cpus_per_task is not None:
|
|
83
|
+
cmd += f' --cpus-per-task {args.cpus_per_task}'
|
|
84
|
+
|
|
85
|
+
if args.gpus_per_task is not None:
|
|
86
|
+
cmd += f' --gpus-per-task {args.gpus_per_task}'
|
|
87
|
+
|
|
88
|
+
if args.mem is not None:
|
|
89
|
+
cmd += f' --mem {args.mem}'
|
|
90
|
+
|
|
91
|
+
if args.mem_per_task is not None:
|
|
92
|
+
cmd += f' --mem-per-task {args.mem_per_task}'
|
|
93
|
+
|
|
94
|
+
if args.mem_per_cpu is not None:
|
|
95
|
+
cmd += f' --mem-per-cpu {args.mem_per_cpu}'
|
|
96
|
+
|
|
97
|
+
if args.mem_per_gpu is not None:
|
|
98
|
+
cmd += f' --mem-per-gpu {args.mem_per_gpu}'
|
|
99
|
+
|
|
100
|
+
if args.nodes is not None:
|
|
101
|
+
cmd += f' --nodes {args.nodes}'
|
|
102
|
+
|
|
103
|
+
if args.ntasks is not None:
|
|
104
|
+
cmd += f' --ntasks {args.ntasks}'
|
|
105
|
+
|
|
106
|
+
if args.output is not None:
|
|
107
|
+
cmd += f' --output {args.output}'
|
|
108
|
+
|
|
109
|
+
if args.error is not None:
|
|
110
|
+
cmd += f' --error {args.error}'
|
|
111
|
+
|
|
112
|
+
if args.input is not None:
|
|
113
|
+
cmd += f' --input {args.input}'
|
|
114
|
+
|
|
115
|
+
if args.job_name is not None:
|
|
116
|
+
cmd += f' --job-name {args.job_name}'
|
|
117
|
+
|
|
118
|
+
if args.chdir is not None:
|
|
119
|
+
cmd += f' --chdir {args.chdir}'
|
|
120
|
+
|
|
121
|
+
if args.time is not None:
|
|
122
|
+
cmd += f' --time {args.time}'
|
|
123
|
+
|
|
124
|
+
return_code = run_command_with_full_controls(cmd, 'run task', args)
|
|
125
|
+
|
|
126
|
+
if return_code != 0:
|
|
127
|
+
xpk_print(f'Running task returned ERROR {return_code}')
|
|
128
|
+
xpk_exit(return_code)
|
xpk/commands/shell.py
CHANGED
|
@@ -12,11 +12,16 @@ limitations under the License.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
|
|
15
|
+
from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
|
|
15
16
|
from ..utils.console import xpk_exit, xpk_print
|
|
16
17
|
from argparse import Namespace
|
|
17
18
|
|
|
18
|
-
from ..core.kjob import
|
|
19
|
-
|
|
19
|
+
from ..core.kjob import (
|
|
20
|
+
AppProfileDefaults,
|
|
21
|
+
prepare_kjob,
|
|
22
|
+
get_pod_template_interactive_command,
|
|
23
|
+
get_gcsfuse_annotation,
|
|
24
|
+
)
|
|
20
25
|
|
|
21
26
|
exit_instructions = 'To exit the shell input "exit".'
|
|
22
27
|
|
|
@@ -45,6 +50,10 @@ def shell(args: Namespace):
|
|
|
45
50
|
|
|
46
51
|
|
|
47
52
|
def get_existing_shell_pod_name(args: Namespace) -> str | None:
|
|
53
|
+
if not args.kind_cluster:
|
|
54
|
+
add_zone_and_project(args)
|
|
55
|
+
get_cluster_credentials(args)
|
|
56
|
+
|
|
48
57
|
return_code, shell_name = run_command_for_value(
|
|
49
58
|
command=(
|
|
50
59
|
'kubectl get pods --no-headers --field-selector status.phase=Running'
|
|
@@ -70,11 +79,22 @@ def get_existing_shell_pod_name(args: Namespace) -> str | None:
|
|
|
70
79
|
|
|
71
80
|
|
|
72
81
|
def connect_to_new_interactive_shell(args: Namespace) -> int:
|
|
82
|
+
err_code = prepare_kjob(args)
|
|
83
|
+
if err_code > 0:
|
|
84
|
+
xpk_exit(err_code)
|
|
85
|
+
create_xpk_k8s_service_account()
|
|
86
|
+
|
|
87
|
+
cmd = (
|
|
88
|
+
'kubectl-kjob create interactive --profile'
|
|
89
|
+
f' {AppProfileDefaults.NAME.value} --pod-running-timeout 180s'
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
gcsfuse_annotation = get_gcsfuse_annotation(args)
|
|
93
|
+
if gcsfuse_annotation is not None:
|
|
94
|
+
cmd += f' --pod-template-annotation {gcsfuse_annotation}'
|
|
95
|
+
|
|
73
96
|
return run_command_with_full_controls(
|
|
74
|
-
command=
|
|
75
|
-
'kubectl-kjob create interactive --profile'
|
|
76
|
-
f' {AppProfileDefaults.NAME.value} --pod-running-timeout 30s'
|
|
77
|
-
),
|
|
97
|
+
command=cmd,
|
|
78
98
|
task='Creating new interactive shell and entering it',
|
|
79
99
|
global_args=args,
|
|
80
100
|
instructions=exit_instructions,
|
|
@@ -87,7 +107,7 @@ def connect_to_existing_interactive_shell(
|
|
|
87
107
|
return run_command_with_full_controls(
|
|
88
108
|
command=(
|
|
89
109
|
f'kubectl exec --stdin --tty {pod_name} --'
|
|
90
|
-
f' {
|
|
110
|
+
f' {get_pod_template_interactive_command()}'
|
|
91
111
|
),
|
|
92
112
|
task='Entering existing interactive shell',
|
|
93
113
|
global_args=args,
|
xpk/commands/storage.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from argparse import Namespace
|
|
18
|
+
|
|
19
|
+
import yaml
|
|
20
|
+
from kubernetes import client as k8s_client
|
|
21
|
+
from kubernetes.client import ApiClient
|
|
22
|
+
from kubernetes.client.rest import ApiException
|
|
23
|
+
|
|
24
|
+
from ..core import gcsfuse
|
|
25
|
+
from ..core.cluster import (
|
|
26
|
+
DEFAULT_NAMESPACE,
|
|
27
|
+
add_zone_and_project,
|
|
28
|
+
get_cluster_network,
|
|
29
|
+
setup_k8s_env,
|
|
30
|
+
update_cluster_with_gcpfilestore_driver_if_necessary,
|
|
31
|
+
update_cluster_with_gcsfuse_driver_if_necessary,
|
|
32
|
+
update_cluster_with_workload_identity_if_necessary,
|
|
33
|
+
)
|
|
34
|
+
from ..core.filestore import FilestoreClient, get_storage_class_name
|
|
35
|
+
from ..core.kjob import (
|
|
36
|
+
KJOB_API_GROUP_NAME,
|
|
37
|
+
KJOB_API_GROUP_VERSION,
|
|
38
|
+
KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
39
|
+
create_volume_bundle_instance,
|
|
40
|
+
)
|
|
41
|
+
from ..core.storage import (
|
|
42
|
+
GCP_FILESTORE_TYPE,
|
|
43
|
+
GCS_FUSE_TYPE,
|
|
44
|
+
STORAGE_CRD_PLURAL,
|
|
45
|
+
XPK_API_GROUP_NAME,
|
|
46
|
+
XPK_API_GROUP_VERSION,
|
|
47
|
+
Storage,
|
|
48
|
+
create_storage_crds,
|
|
49
|
+
get_storage,
|
|
50
|
+
list_storages,
|
|
51
|
+
print_storages_for_cluster,
|
|
52
|
+
)
|
|
53
|
+
from ..utils.console import get_user_input, xpk_exit, xpk_print
|
|
54
|
+
from ..utils.kubectl import apply_kubectl_manifest
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def storage_create(args: Namespace) -> None:
|
|
58
|
+
add_zone_and_project(args)
|
|
59
|
+
if args.type == GCP_FILESTORE_TYPE:
|
|
60
|
+
if args.instance is None:
|
|
61
|
+
args.instance = args.name
|
|
62
|
+
|
|
63
|
+
filestore_client = FilestoreClient(args.zone, args.instance, args.project)
|
|
64
|
+
filestore_exists = filestore_client.check_instance_exists()
|
|
65
|
+
if filestore_exists:
|
|
66
|
+
xpk_print(f"Filestore instance {args.instance} already exists.")
|
|
67
|
+
xpk_exit(1)
|
|
68
|
+
filestore_network = get_cluster_network(args)
|
|
69
|
+
xpk_print(
|
|
70
|
+
f"Creating Filestore instance {args.instance} in network:"
|
|
71
|
+
f" {filestore_network}"
|
|
72
|
+
)
|
|
73
|
+
filestore_client.create_instance(
|
|
74
|
+
vol=args.vol, size=args.size, tier=args.tier, network=filestore_network
|
|
75
|
+
)
|
|
76
|
+
if args.manifest is not None:
|
|
77
|
+
with open(args.manifest, "r", encoding="utf-8") as f:
|
|
78
|
+
manifest = list(yaml.safe_load_all(f))
|
|
79
|
+
else:
|
|
80
|
+
manifest = filestore_client.manifest(
|
|
81
|
+
args.name, args.vol, args.access_mode, filestore_network
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
k8s_api_client = setup_k8s_env(args)
|
|
85
|
+
create_storage_crds(k8s_api_client, args, manifest)
|
|
86
|
+
create_volume_bundle_instance(
|
|
87
|
+
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
88
|
+
)
|
|
89
|
+
return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
90
|
+
if return_code > 0:
|
|
91
|
+
xpk_exit(return_code)
|
|
92
|
+
return_code = update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
93
|
+
if return_code > 0:
|
|
94
|
+
xpk_exit(return_code)
|
|
95
|
+
apply_kubectl_manifest(k8s_api_client, manifest)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def storage_delete(args: Namespace) -> None:
|
|
99
|
+
add_zone_and_project(args)
|
|
100
|
+
k8s_api_client = setup_k8s_env(args)
|
|
101
|
+
storages = list_storages(k8s_api_client)
|
|
102
|
+
filestore_client = FilestoreClient(args.zone, args.name, args.project)
|
|
103
|
+
|
|
104
|
+
if not filestore_client.check_instance_exists():
|
|
105
|
+
xpk_print(f"Filestore instance {args.name} does not exist.")
|
|
106
|
+
xpk_exit(1)
|
|
107
|
+
|
|
108
|
+
filestore_instance_name = filestore_client.get_instance_fullname()
|
|
109
|
+
|
|
110
|
+
children = [
|
|
111
|
+
storage
|
|
112
|
+
for storage in storages
|
|
113
|
+
if storage.bucket.startswith(filestore_instance_name)
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
if children and not args.force:
|
|
117
|
+
detach = get_user_input(
|
|
118
|
+
"Deleting a filestore storage will destroy your filestore instance and"
|
|
119
|
+
" all its data in all volumes will be lost. Do you wish to delete the"
|
|
120
|
+
f" filestore instance {filestore_instance_name}?\n y (yes) / n (no):\n'"
|
|
121
|
+
)
|
|
122
|
+
if not detach:
|
|
123
|
+
xpk_print("Deleting storage canceled.")
|
|
124
|
+
xpk_exit(0)
|
|
125
|
+
|
|
126
|
+
for child in children:
|
|
127
|
+
delete_storage_resources(k8s_api_client, child)
|
|
128
|
+
|
|
129
|
+
filestore_client.delete_filestore_instance()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def storage_attach(args: Namespace) -> None:
|
|
133
|
+
add_zone_and_project(args)
|
|
134
|
+
if args.type == GCP_FILESTORE_TYPE:
|
|
135
|
+
if args.instance is None:
|
|
136
|
+
args.instance = args.name
|
|
137
|
+
|
|
138
|
+
filestore_client = FilestoreClient(args.zone, args.instance, args.project)
|
|
139
|
+
|
|
140
|
+
filestore_exists = filestore_client.check_instance_exists()
|
|
141
|
+
if not filestore_exists:
|
|
142
|
+
xpk_print(f"Filestore instance {args.instance} does not exists.")
|
|
143
|
+
xpk_exit(1)
|
|
144
|
+
|
|
145
|
+
if args.manifest is not None:
|
|
146
|
+
with open(args.manifest, "r", encoding="utf-8") as f:
|
|
147
|
+
manifest = list(yaml.safe_load_all(f))
|
|
148
|
+
else:
|
|
149
|
+
filestore_network = get_cluster_network(args)
|
|
150
|
+
manifest = filestore_client.manifest(
|
|
151
|
+
args.name, args.vol, args.access_mode, filestore_network
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
else: # args.type == GCS_FUSE_TYPE:
|
|
155
|
+
if args.manifest is None and args.size is None:
|
|
156
|
+
xpk_print("--size is required when attaching gcsfuse storage.")
|
|
157
|
+
xpk_exit(1)
|
|
158
|
+
|
|
159
|
+
if args.bucket is None:
|
|
160
|
+
args.bucket = args.name
|
|
161
|
+
|
|
162
|
+
if args.manifest is not None:
|
|
163
|
+
with open(args.manifest, "r", encoding="utf-8") as f:
|
|
164
|
+
manifest = list(yaml.safe_load_all(f))
|
|
165
|
+
else:
|
|
166
|
+
manifest = gcsfuse.manifest(
|
|
167
|
+
name=args.name, bucket=args.bucket, size=args.size
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
k8s_api_client = setup_k8s_env(args)
|
|
171
|
+
create_storage_crds(k8s_api_client, args, manifest)
|
|
172
|
+
create_volume_bundle_instance(
|
|
173
|
+
k8s_api_client, args.name, manifest, args.readonly, args.mount_point
|
|
174
|
+
)
|
|
175
|
+
return_code = update_cluster_with_workload_identity_if_necessary(args)
|
|
176
|
+
if return_code > 0:
|
|
177
|
+
xpk_exit(return_code)
|
|
178
|
+
|
|
179
|
+
# args.type can have only two values after parsing
|
|
180
|
+
return_code = (
|
|
181
|
+
update_cluster_with_gcsfuse_driver_if_necessary(args)
|
|
182
|
+
if args.type == GCS_FUSE_TYPE
|
|
183
|
+
else update_cluster_with_gcpfilestore_driver_if_necessary(args)
|
|
184
|
+
)
|
|
185
|
+
if return_code > 0:
|
|
186
|
+
xpk_exit(return_code)
|
|
187
|
+
|
|
188
|
+
apply_kubectl_manifest(k8s_api_client, manifest)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def storage_list(args: Namespace) -> None:
|
|
192
|
+
k8s_api_client = setup_k8s_env(args)
|
|
193
|
+
storages = list_storages(k8s_api_client)
|
|
194
|
+
print_storages_for_cluster(storages)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def storage_detach(args: Namespace) -> None:
|
|
198
|
+
k8s_api_client = setup_k8s_env(args)
|
|
199
|
+
storage = get_storage(k8s_api_client, args.name)
|
|
200
|
+
delete_storage_resources(k8s_api_client, storage)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def delete_resource(api_call, resource_name: str, resource_kind: str) -> None:
|
|
204
|
+
"""
|
|
205
|
+
Deletes a Kubernetes resource and handles potential API exceptions.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
api_call: The function to call for deleting the resource.
|
|
209
|
+
resource_name: The name of the resource to delete.
|
|
210
|
+
resource_type: The type of the resource (e.g., "Persistent Volume Claim").
|
|
211
|
+
"""
|
|
212
|
+
xpk_print(f"Deleting {resource_kind}:{resource_name}")
|
|
213
|
+
try:
|
|
214
|
+
api_call(resource_name)
|
|
215
|
+
except ApiException as e:
|
|
216
|
+
if e.status == 404:
|
|
217
|
+
xpk_print(
|
|
218
|
+
f"{resource_kind}: {resource_name} not found. "
|
|
219
|
+
f"Might be already deleted. Error: {e}"
|
|
220
|
+
)
|
|
221
|
+
return
|
|
222
|
+
else:
|
|
223
|
+
xpk_print(f"Encountered error during {resource_kind} deletion: {e}")
|
|
224
|
+
xpk_exit(1)
|
|
225
|
+
xpk_print(f"Deleted {resource_kind}:{resource_name}")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def delete_storage_resources(k8s_api_client: ApiClient, storage: Storage):
|
|
229
|
+
"""
|
|
230
|
+
Deletes storage PV, PVC, SC and custom resources (if they exist).
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
k8s_api_client: An ApiClient object for interacting with the Kubernetes API.
|
|
234
|
+
storage: Storage to delete
|
|
235
|
+
"""
|
|
236
|
+
api_instance = k8s_client.CustomObjectsApi(k8s_api_client)
|
|
237
|
+
core_api = k8s_client.CoreV1Api()
|
|
238
|
+
storage_api = k8s_client.StorageV1Api()
|
|
239
|
+
|
|
240
|
+
delete_resource(
|
|
241
|
+
lambda name: core_api.delete_namespaced_persistent_volume_claim(
|
|
242
|
+
name, "default"
|
|
243
|
+
),
|
|
244
|
+
storage.pvc,
|
|
245
|
+
"Persistent Volume Claim",
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
delete_resource(
|
|
249
|
+
core_api.delete_persistent_volume, storage.pv, "Persistent Volume"
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
if storage.type == GCP_FILESTORE_TYPE:
|
|
253
|
+
delete_resource(
|
|
254
|
+
storage_api.delete_storage_class,
|
|
255
|
+
get_storage_class_name(storage.name),
|
|
256
|
+
"Storage Class",
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
delete_resource(
|
|
260
|
+
lambda name: api_instance.delete_namespaced_custom_object(
|
|
261
|
+
namespace=DEFAULT_NAMESPACE,
|
|
262
|
+
name=name,
|
|
263
|
+
group=KJOB_API_GROUP_NAME,
|
|
264
|
+
version=KJOB_API_GROUP_VERSION,
|
|
265
|
+
plural=KJOB_API_VOLUME_BUNDLE_PLURAL,
|
|
266
|
+
),
|
|
267
|
+
storage.name,
|
|
268
|
+
"VolumeBundle",
|
|
269
|
+
)
|
|
270
|
+
|
|
271
|
+
delete_resource(
|
|
272
|
+
lambda name: api_instance.delete_cluster_custom_object(
|
|
273
|
+
name=name,
|
|
274
|
+
group=XPK_API_GROUP_NAME,
|
|
275
|
+
version=XPK_API_GROUP_VERSION,
|
|
276
|
+
plural=STORAGE_CRD_PLURAL,
|
|
277
|
+
),
|
|
278
|
+
storage.name,
|
|
279
|
+
"Storage",
|
|
280
|
+
)
|
xpk/commands/version.py
CHANGED
|
@@ -14,26 +14,14 @@ See the License for the specific language governing permissions and
|
|
|
14
14
|
limitations under the License.
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
-
from
|
|
18
|
-
import
|
|
17
|
+
from ..core.config import __version__
|
|
18
|
+
from ..utils.console import xpk_print
|
|
19
19
|
|
|
20
|
-
from ..core.commands import run_command_for_value
|
|
21
20
|
|
|
22
|
-
|
|
21
|
+
def get_xpk_version() -> str:
|
|
22
|
+
return __version__
|
|
23
23
|
|
|
24
|
-
from ..utils.console import xpk_exit, xpk_print
|
|
25
24
|
|
|
26
|
-
|
|
27
|
-
def version(args: Namespace) -> None:
|
|
25
|
+
def version(args) -> None: # pylint: disable=unused-argument
|
|
28
26
|
"""Get version of xpk."""
|
|
29
|
-
xpk_print('xpk_version:',
|
|
30
|
-
if os.path.exists(os.path.join(os.getcwd(), '.git')):
|
|
31
|
-
code, xpk_version = run_command_for_value(
|
|
32
|
-
'git rev-parse HEAD',
|
|
33
|
-
task='Get latest hash',
|
|
34
|
-
global_args=args,
|
|
35
|
-
quiet=True,
|
|
36
|
-
)
|
|
37
|
-
if code != 0:
|
|
38
|
-
xpk_exit(code)
|
|
39
|
-
xpk_print('git commit:', xpk_version.strip('\n'))
|
|
27
|
+
xpk_print('xpk_version:', __version__)
|