xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/commands/kind.py ADDED
@@ -0,0 +1,283 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..core.commands import (
18
+ run_command_for_value,
19
+ run_command_with_updates,
20
+ )
21
+ from ..core.cluster import set_jobset_on_cluster, setup_k8s_env
22
+ from ..core.kjob import (
23
+ verify_kjob_installed,
24
+ prepare_kjob,
25
+ apply_kjob_crds,
26
+ )
27
+ from ..core.kueue import (
28
+ install_kueue_on_cluster,
29
+ install_kueue_crs,
30
+ wait_for_kueue_available,
31
+ )
32
+ from ..core.storage import install_storage_crd
33
+ from ..core.system_characteristics import (
34
+ SystemCharacteristics,
35
+ AcceleratorType,
36
+ )
37
+ from ..utils.console import (xpk_exit, xpk_print)
38
+
39
+
40
def cluster_create(args) -> None:
  """Create a local Kind cluster if needed and set up xpk on it.

  The steps run in a fixed order -- cluster creation, kubectl context switch,
  JobSet, Kueue, kjob, storage CRD, Kueue custom resources -- and the first
  step that reports a non-zero code has that code handed to xpk_exit.

  Args:
    args: user provided arguments for running the command. `kind_cluster`,
      `num_slices` and `enable_pathways` are overwritten on this object.

  Returns:
    None. The result is reported through xpk_exit: the failing step's code,
    or 0 once every step has succeeded.
  """
  xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)

  create_cluster_command_code = create_cluster_if_necessary(args)
  if create_cluster_command_code != 0:
    xpk_exit(create_cluster_command_code)

  set_cluster_command_code = set_local_cluster_command(args)
  if set_cluster_command_code != 0:
    xpk_exit(set_cluster_command_code)

  xpk_print(
      'Enabling the jobset API on our cluster, to be deprecated when Jobset is'
      ' globally available'
  )
  set_jobset_on_cluster_code = set_jobset_on_cluster(args)
  if set_jobset_on_cluster_code != 0:
    xpk_exit(set_jobset_on_cluster_code)

  xpk_print('Enabling Kueue on the cluster')
  install_kueue_on_cluster_code = install_kueue_on_cluster(args)
  if install_kueue_on_cluster_code != 0:
    xpk_exit(install_kueue_on_cluster_code)

  # The kjob steps use the same `!= 0` test as every other step so that a
  # negative code is not mistaken for success.
  xpk_print('Verifying kjob installation')
  err_code = verify_kjob_installed(args)
  if err_code != 0:
    xpk_exit(err_code)

  xpk_print('Applying kjob CRDs')
  err_code = apply_kjob_crds(args)
  if err_code != 0:
    xpk_exit(err_code)

  # Set before prepare_kjob, which receives the same args object.
  args.kind_cluster = True
  err_code = prepare_kjob(args)
  if err_code != 0:
    xpk_exit(err_code)

  k8s_client = setup_k8s_env(args)
  install_storage_crd(k8s_client)

  xpk_print('Wait for Kueue to be fully available')
  wait_for_kueue_available_code = wait_for_kueue_available(args)
  if wait_for_kueue_available_code != 0:
    xpk_exit(wait_for_kueue_available_code)

  args.num_slices = 1
  args.enable_pathways = False
  # NOTE(review): the fields are passed positionally; confirm their order
  # against the SystemCharacteristics definition.
  system = SystemCharacteristics(
      'N/A',
      1,
      'N/A',
      'N/A',
      1,
      AcceleratorType['CPU'],
      'kind',
  )

  xpk_print('Install Kueue Custom Resources')
  enable_kueue_credentials_code = install_kueue_crs(args, system, None)
  if enable_kueue_credentials_code != 0:
    xpk_exit(enable_kueue_credentials_code)

  xpk_print('Kind commands done! Resources are created.')
  xpk_exit(0)
114
+
115
+
116
def cluster_delete(args) -> None:
  """Delete the local Kind cluster named by the arguments.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The delete command's non-zero code, or 0 on success, is passed to
    xpk_exit.
  """
  cluster_name = args.cluster
  xpk_print(f'Starting cluster delete for cluster: {cluster_name}', flush=True)

  delete_code = run_kind_cluster_delete_command(args)
  if delete_code != 0:
    xpk_exit(delete_code)

  xpk_print(f'Kind commands done! Cluster {args.cluster} deleted.')
  xpk_exit(0)
132
+
133
+
134
def cluster_list(args) -> None:
  """List the local Kind clusters.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. xpk_exit receives 1 when the listing reports a failure and 0
    otherwise.
  """
  listing_failed = bool(run_kind_clusters_list_command(args))
  if listing_failed:
    xpk_exit(1)
  xpk_exit(0)
146
+
147
+
148
def create_cluster_if_necessary(args) -> int:
  """Create the Kind cluster unless one with that name already exists.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if the cluster exists or was created, 1 otherwise.
  """
  existing_clusters, list_code = get_all_local_clusters_programmatic(args)

  # Guard: without a reliable listing we cannot decide whether to create.
  if list_code > 0:
    xpk_print('Listing all clusters failed!')
    return 1

  if args.cluster not in existing_clusters:
    return run_kind_cluster_create_command(args)

  xpk_print('Skipping cluster creation since it already exists.')
  return 0
166
+
167
+
168
def run_kind_cluster_delete_command(args) -> int:
  """Run `kind delete cluster`, naming the cluster when one is given.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  name_flag = f' --name={args.cluster}' if args.cluster else ''
  command = 'kind delete cluster' + name_flag

  status = run_command_with_updates(command, 'Cluster Delete', args)
  if status == 0:
    return 0

  xpk_print(f'Cluster delete request returned ERROR {status}')
  return 1
188
+
189
+
190
def run_kind_clusters_list_command(args) -> int:
  """Run `kind get clusters` and stream its output.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  status = run_command_with_updates('kind get clusters', 'Cluster List', args)
  if status == 0:
    return 0

  xpk_print(f'Cluster list request returned ERROR {status}')
  return 1
206
+
207
+
208
def run_kind_cluster_create_command(args) -> int:
  """Run the Create Kind Cluster request.

  Builds `kind create cluster`, adding `--name` when a cluster name is given
  and `--image=kindest/node:v<version>` when a Kubernetes version is given.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  command = 'kind create cluster'

  if args.cluster:
    command += f' --name={args.cluster}'

  if args.k8s_version:
    command += f' --image=kindest/node:v{args.k8s_version}'

  return_code = run_command_with_updates(command, 'Kind Cluster Create', args)
  if return_code != 0:
    # This is a Kind cluster, so the message must not mention GKE.
    xpk_print(f'Kind Cluster Create request returned ERROR {return_code}')
    return 1
  return 0
230
+
231
+
232
def get_all_local_clusters_programmatic(args) -> tuple[list[str], int]:
  """Collect the names printed by `kind get clusters`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    A pair of (cluster names, 0) on success, or ([], non-zero code) when the
    command fails.
  """
  task = 'Find if Cluster Exists'
  status, listing = run_command_for_value('kind get clusters', task, args)

  if status == 0:
    # One cluster name per output line.
    return listing.splitlines(), 0

  xpk_print(f'Find if Cluster Exists returned ERROR {status}')
  return [], status
250
+
251
+
252
def set_local_cluster_command(args) -> int:
  """Run local cluster configuration command to set the kubectl config.

  With a cluster name, kubectl is switched to the `kind-<name>` context in
  the default namespace. Without one, the current kubectl context is left in
  place and only reported.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful, otherwise the non-zero code of the kubectl command.
  """
  if not args.cluster:
    command = 'kubectl config current-context'
    return_code, current_context = run_command_for_value(
        command, 'get current-context', args
    )
    if return_code != 0:
      # Without this check the output of a failed lookup would be announced
      # as the context in use.
      xpk_print(f'get current-context returned ERROR {return_code}')
      return return_code
    xpk_print(
        'No local cluster name specified. Using current-context'
        f' `{current_context.strip()}`'
    )
    return return_code

  command = (
      f'kubectl config use-context kind-{args.cluster} --namespace=default'
  )
  task = f'switch to cluster {args.cluster}'
  return_code = run_command_with_updates(
      command,
      task,
      args,
  )
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
xpk/commands/kjob_common.py ADDED
@@ -0,0 +1,44 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..core.kjob import get_a3mega_pod_template_annotations, get_a3ultra_pod_template_annotations
18
+ from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
19
+ from ..core.cluster import get_gpu_type_from_cluster
20
+
21
+
22
def add_tcpxo_annotations(args, cmd: str) -> str:
  """Append the A3 Mega pod-template annotations to a kjob command.

  Args:
    args: user provided arguments for running the command.
    cmd: the command built so far.

  Returns:
    cmd followed by three `--pod-template-annotation` flags, separated by
    shell line continuations and ending with a single space.
  """
  tcpxo, interfaces, eth0 = get_a3mega_pod_template_annotations(args)
  # Emitted in the order tcpxo, eth0, interfaces.
  flags = [
      f" --pod-template-annotation {annotation}"
      for annotation in (tcpxo, eth0, interfaces)
  ]
  return cmd + " \\\n".join(flags) + " "
28
+
29
+
30
def add_rdma_annotations(args, cmd) -> str:
  """Append the A3 Ultra pod-template annotations to a kjob command.

  Args:
    args: user provided arguments for running the command.
    cmd: the command built so far.

  Returns:
    cmd followed by two `--pod-template-annotation` flags, each terminated
    by a shell line continuation.
  """
  eth0, interfaces = get_a3ultra_pod_template_annotations(args)
  suffix = "".join(
      f" --pod-template-annotation {annotation} \\\n"
      for annotation in (eth0, interfaces)
  )
  return cmd + suffix
35
+
36
+
37
def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
  """Add the networking annotations that match the cluster's GPU type.

  Args:
    args: user provided arguments for running the command.
    cmd: the command built so far.

  Returns:
    cmd with TCPXO annotations for H100 Mega, RDMA annotations for H200, or
    unchanged for any other GPU type.
  """
  detected_gpu = get_gpu_type_from_cluster(args)

  if detected_gpu == H100_MEGA_DEVICE_TYPE:
    cmd = add_tcpxo_annotations(args, cmd)
  elif detected_gpu == H200_DEVICE_TYPE:
    cmd = add_rdma_annotations(args, cmd)
  return cmd
xpk/commands/run.py ADDED
@@ -0,0 +1,128 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from argparse import Namespace
18
+
19
+ from ..core.cluster import create_xpk_k8s_service_account
20
+ from ..core.commands import run_command_with_full_controls
21
+ from ..core.gcloud_context import add_zone_and_project
22
+ from ..core.kueue import LOCAL_QUEUE_NAME
23
+ from ..utils.console import xpk_exit, xpk_print
24
+ from .common import set_cluster_command
25
+ from ..core.kjob import JobTemplateDefaults, AppProfileDefaults, prepare_kjob, Kueue_TAS_annotation, get_gcsfuse_annotation
26
+ from .kjob_common import add_gpu_networking_annotations_to_command
27
+ from .kind import set_local_cluster_command
28
+
29
+
30
def run(args: Namespace) -> None:
  """Select the target cluster, prepare kjob and submit the script.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. A failing setup step hands its code to xpk_exit.
  """
  if args.kind_cluster:
    context_code = set_local_cluster_command(args)
  else:
    add_zone_and_project(args)
    context_code = set_cluster_command(args)

  if context_code != 0:
    xpk_exit(context_code)

  kjob_code = prepare_kjob(args)
  if kjob_code > 0:
    xpk_exit(kjob_code)
  create_xpk_k8s_service_account()

  submit_job(args)
53
+
54
+
55
def submit_job(args: Namespace) -> None:
  """Build and run the `kubectl kjob create slurm` command for a script.

  kjob-level flags (annotations, wait timeout, unknown-flag handling) come
  before the `--` separator; the script and the Slurm-style options the user
  supplied come after it.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. A non-zero code from the command is printed and passed to xpk_exit.
  """
  container = JobTemplateDefaults.CONTAINER_NAME.value
  cmd = ''.join([
      'kubectl kjob create slurm --profile',
      f' {AppProfileDefaults.NAME.value} ',
      f' --localqueue {LOCAL_QUEUE_NAME} ',
      f" --pod-template-annotation '{Kueue_TAS_annotation}'",
      f' --stream-container {container}',
      f' --worker-container {container}',
      ' --wait --rm --first-node-ip',
  ])
  cmd = add_gpu_networking_annotations_to_command(args, cmd)

  fuse_annotation = get_gcsfuse_annotation(args)
  if fuse_annotation is not None:
    cmd += f' --pod-template-annotation {fuse_annotation}'

  if args.timeout:
    cmd += f' --wait-timeout {args.timeout}s'

  if args.ignore_unknown_flags:
    cmd += ' --ignore-unknown-flags'

  cmd += f' -- {args.script} --partition {LOCAL_QUEUE_NAME}'

  # Options forwarded only when the user set them, in this fixed order.
  forwarded_options = (
      ('--array', args.array),
      ('--cpus-per-task', args.cpus_per_task),
      ('--gpus-per-task', args.gpus_per_task),
      ('--mem', args.mem),
      ('--mem-per-task', args.mem_per_task),
      ('--mem-per-cpu', args.mem_per_cpu),
      ('--mem-per-gpu', args.mem_per_gpu),
      ('--nodes', args.nodes),
      ('--ntasks', args.ntasks),
      ('--output', args.output),
      ('--error', args.error),
      ('--input', args.input),
      ('--job-name', args.job_name),
      ('--chdir', args.chdir),
      ('--time', args.time),
  )
  for flag, value in forwarded_options:
    if value is not None:
      cmd += f' {flag} {value}'

  exit_status = run_command_with_full_controls(cmd, 'run task', args)

  if exit_status != 0:
    xpk_print(f'Running task returned ERROR {exit_status}')
    xpk_exit(exit_status)
xpk/commands/shell.py ADDED
@@ -0,0 +1,140 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+ https://www.apache.org/licenses/LICENSE-2.0
7
+ Unless required by applicable law or agreed to in writing, software
8
+ distributed under the License is distributed on an "AS IS" BASIS,
9
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ See the License for the specific language governing permissions and
11
+ limitations under the License.
12
+ """
13
+
14
+ from ..core.commands import run_command_with_full_controls, run_command_for_value, run_command_with_updates
15
+ from ..core.cluster import get_cluster_credentials, add_zone_and_project, create_xpk_k8s_service_account
16
+ from ..utils.console import xpk_exit, xpk_print
17
+ from argparse import Namespace
18
+
19
+ from ..core.kjob import (
20
+ AppProfileDefaults,
21
+ prepare_kjob,
22
+ get_pod_template_interactive_command,
23
+ get_gcsfuse_annotation,
24
+ )
25
+
26
+ exit_instructions = 'To exit the shell input "exit".'
27
+
28
+
29
def shell(args: Namespace):
  """Enter an interactive shell, reusing a running shell pod if one exists.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. The shell command's non-zero code, or 0 on success, is passed to
    xpk_exit.
  """
  running_pod = get_existing_shell_pod_name(args)

  exit_status = (
      connect_to_new_interactive_shell(args)
      if running_pod is None
      else connect_to_existing_interactive_shell(running_pod, args)
  )

  if exit_status != 0:
    xpk_print(f'The command failed with code {exit_status}.')
    xpk_exit(exit_status)

  xpk_exit(0)
50
+
51
+
52
def get_existing_shell_pod_name(args: Namespace) -> str | None:
  """Find a running pod that belongs to an interactive kjob shell.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The first running pod whose name contains both the app profile name and
    'interactive', or None when there is no such pod.
  """
  if not args.kind_cluster:
    add_zone_and_project(args)
    get_cluster_credentials(args)

  list_running_pods = (
      'kubectl get pods --no-headers --field-selector status.phase=Running'
      ' -o custom-columns=":metadata.name"'
  )
  status, listing = run_command_for_value(
      command=list_running_pods,
      task='Get existing interactive shell pod name.',
      global_args=args,
  )
  if status != 0:
    xpk_print(
        f'Encounter an error with a code {status} when checking for'
        ' existing running shell.'
    )
    xpk_exit(status)

  profile_name = AppProfileDefaults.NAME.value
  matching_pods = (
      pod
      for pod in listing.strip().split('\n')
      if profile_name in pod and 'interactive' in pod
  )
  return next(matching_pods, None)
79
+
80
+
81
def connect_to_new_interactive_shell(args: Namespace) -> int:
  """Prepare kjob, then create a new interactive shell and enter it.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The code returned by the `kubectl-kjob create interactive` command.
  """
  kjob_code = prepare_kjob(args)
  if kjob_code > 0:
    xpk_exit(kjob_code)
  create_xpk_k8s_service_account()

  profile = AppProfileDefaults.NAME.value
  cmd = (
      f'kubectl-kjob create interactive --profile {profile}'
      ' --pod-running-timeout 180s'
  )

  fuse_annotation = get_gcsfuse_annotation(args)
  if fuse_annotation is not None:
    cmd += f' --pod-template-annotation {fuse_annotation}'

  return run_command_with_full_controls(
      command=cmd,
      task='Creating new interactive shell and entering it',
      global_args=args,
      instructions=exit_instructions,
  )
102
+
103
+
104
def connect_to_existing_interactive_shell(
    pod_name: str, args: Namespace
) -> int:
  """Attach to an already running interactive shell pod.

  Args:
    pod_name: name of the pod to exec into.
    args: user provided arguments for running the command.

  Returns:
    The code returned by the `kubectl exec` command.
  """
  interactive_command = get_pod_template_interactive_command()
  exec_command = (
      f'kubectl exec --stdin --tty {pod_name} -- {interactive_command}'
  )
  return run_command_with_full_controls(
      command=exec_command,
      task='Entering existing interactive shell',
      global_args=args,
      instructions=exit_instructions,
  )
116
+
117
+
118
def shell_stop(args: Namespace):
  """Stop the running interactive shell by deleting its pod.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None. xpk_exit receives 0 when no shell is running or the pod was
    deleted, and the delete command's code when it fails.
  """
  running_pod = get_existing_shell_pod_name(args)

  if running_pod is None:
    xpk_print('There is no shell running to stop')
    xpk_exit(0)

  delete_status = run_command_with_updates(
      command=f'kubectl delete pod {running_pod}',
      task='Deleting the existing shell.',
      global_args=args,
  )
  if delete_status != 0:
    xpk_exit(delete_status)

  xpk_print('The shell was deleted successfully.')
  xpk_exit(0)