xpk 0.17.2__py3-none-any.whl → 0.17.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/batch.py DELETED
@@ -1,144 +0,0 @@
- """
- Copyright 2024 Google LLC
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- https://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- import re
- from argparse import Namespace
-
- from ..core.cluster import (
-     setup_k8s_service_accounts,
-     get_cluster_credentials,
- )
- from ..core.commands import run_command_for_value
- from ..core.gcloud_context import add_zone_and_project
- from ..core.kjob import (
-     AppProfileDefaults,
-     JobTemplateDefaults,
-     get_storage_annotations,
-     prepare_kjob,
- )
- from ..core.kueue_manager import LOCAL_QUEUE_NAME
- from ..utils.console import xpk_exit, xpk_print
- from ..utils.execution_context import is_dry_run
- from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
- from .kind import set_local_cluster_command
- from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
-
-
- def batch(args: Namespace) -> None:
-   """Run batch task.
-   This function runs passed script in non-blocking manner.
-   Args:
-     args: user provided arguments for running the command.
-   Returns:
-     None
-   """
-   if should_validate_dependencies(args):
-     validate_dependencies_list([
-         SystemDependency.KUBECTL,
-         SystemDependency.KJOB,
-         SystemDependency.GCLOUD,
-     ])
-   if not args.kind_cluster:
-     add_zone_and_project(args)
-     get_cluster_credentials(args)
-   else:
-     set_cluster_command_code = set_local_cluster_command(args)
-     if set_cluster_command_code != 0:
-       xpk_exit(set_cluster_command_code)
-
-   if not is_dry_run():
-     err_code = prepare_kjob(args)
-     if err_code > 0:
-       xpk_exit(err_code)
-     setup_k8s_service_accounts()
-
-   submit_job(args)
-
-
- def submit_job(args: Namespace) -> None:
-   cmd = (
-       'kubectl kjob create slurm'
-       f' --profile {AppProfileDefaults.NAME.value}'
-       f' --localqueue {LOCAL_QUEUE_NAME}'
-       f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
-       ' --first-node-ip'
-   )
-   cmd = add_gpu_networking_annotations_to_command(args, cmd)
-   cmd = add_TAS_annotations_to_command(args, cmd)
-
-   annotations = [] if is_dry_run() else get_storage_annotations(args)
-   for annotation in annotations:
-     cmd += f' --pod-template-annotation {annotation}'
-
-   if args.ignore_unknown_flags:
-     cmd += ' --ignore-unknown-flags'
-
-   cmd += f' -- {args.script} --partition {LOCAL_QUEUE_NAME}'
-
-   if args.array is not None:
-     cmd += f' --array {args.array}'
-
-   if args.cpus_per_task is not None:
-     cmd += f' --cpus-per-task {args.cpus_per_task}'
-
-   if args.gpus_per_task is not None:
-     cmd += f' --gpus-per-task {args.gpus_per_task}'
-
-   if args.mem is not None:
-     cmd += f' --mem {args.mem}'
-
-   if args.mem_per_task is not None:
-     cmd += f' --mem-per-task {args.mem_per_task}'
-
-   if args.mem_per_cpu is not None:
-     cmd += f' --mem-per-cpu {args.mem_per_cpu}'
-
-   if args.mem_per_gpu is not None:
-     cmd += f' --mem-per-gpu {args.mem_per_gpu}'
-
-   if args.nodes is not None:
-     cmd += f' --nodes {args.nodes}'
-
-   if args.ntasks is not None:
-     cmd += f' --ntasks {args.ntasks}'
-
-   if args.output is not None:
-     cmd += f' --output {args.output}'
-
-   if args.error is not None:
-     cmd += f' --error {args.error}'
-
-   if args.input is not None:
-     cmd += f' --input {args.input}'
-
-   if args.job_name is not None:
-     cmd += f' --job-name {args.job_name}'
-
-   if args.chdir is not None:
-     cmd += f' --chdir {args.chdir}'
-
-   if args.time is not None:
-     cmd += f' --time {args.time}'
-
-   return_code, return_value = run_command_for_value(cmd, 'submit job')
-
-   if return_code != 0:
-     xpk_print(f'Running batch job returned ERROR {return_code}')
-     xpk_exit(return_code)
-
-   m = re.match(r'job\.batch/([-a-z0-9]+)', return_value)
-   if m:
-     xpk_print(f'Job name: {m.group(1)}')
xpk/commands/job.py DELETED
@@ -1,244 +0,0 @@
- """
- Copyright 2024 Google LLC
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- https://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- import re
- import sys
-
- from ruamel.yaml import YAML
- from typing import cast
-
- from ..core.commands import run_command_for_value, run_command_with_updates
- from ..core.cluster import get_cluster_credentials
- from ..core.gcloud_context import add_zone_and_project
- from ..core.kjob import AppProfileDefaults
- from ..utils.console import xpk_exit, xpk_print
- from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
- from .kind import set_local_cluster_command
-
-
- JOBS_DRY_RUN_YAML = """
- items:
- - apiVersion: slurm.k8s.io/v1alpha1
-   kind: SlurmJob
-   metadata:
-     annotations:
-       kjobctl.x-k8s.io/script: echo hello
-     creationTimestamp: '2024-04-29T12:00:00Z'
-     labels:
-       kjobctl.x-k8s.io/app-profile: default
-     name: golden-job
-     namespace: default
-   spec:
-     script: echo hello
- """
-
- PODS_DRY_RUN_RESULT = """
- foo-pod 2/2 Running 0 2d
- bar-pod 1/1 Evicted 0 1d
- """
-
-
- def job_info(args):
-   """Run commands obtaining information about a job given by name.
-
-   Args:
-     args: user provided arguments for running the command.
-
-   Returns:
-     None
-   """
-   if should_validate_dependencies(args):
-     validate_dependencies_list([
-         SystemDependency.KUBECTL,
-         SystemDependency.KJOB,
-         SystemDependency.GCLOUD,
-     ])
-   job_name = args.name
-
-   desc_command = f'kubectl-kjob describe slurm {job_name}'
-   desc_code, desc_text = run_command_for_value(desc_command, 'Getting job data')
-   if desc_code != 0:
-     xpk_print(f'Data info request returned ERROR {desc_code}')
-     xpk_exit(desc_code)
-
-   job_command = (
-       'kubectl-kjob list slurm -o yaml --field-selector'
-       f' metadata.name=={job_name}'
-   )
-   job_code, job_text = run_command_for_value(
-       job_command,
-       'Getting job info',
-       dry_run_return_val=JOBS_DRY_RUN_YAML,
-   )
-   if job_code != 0:
-     xpk_print(f'Job info request returned ERROR {job_code}')
-     xpk_exit(job_code)
-
-   pods_command = f'kubectl get pods -l=job-name={job_name} --no-headers'
-   pods_code, pods_text = run_command_for_value(
-       pods_command,
-       'Getting pods list',
-       dry_run_return_val=PODS_DRY_RUN_RESULT,
-   )
-   if pods_code != 0:
-     xpk_print(f'Pods list request returned ERROR {pods_code}')
-     xpk_exit(pods_code)
-
-   yaml = YAML(typ='safe')
-   job_yaml = yaml.load(job_text)['items'][0]
-
-   output = {
-       'Job name': job_name,
-       'Script name': get_script_name(job_yaml),
-       'Profile': get_profile(job_yaml),
-       'Labels': job_yaml.get('metadata').get('labels', []),
-       'Mounts': get_mounts(job_yaml),
-       'Pods': get_pods(pods_text),
-       'Entrypoint environment variables template': get_kjob_env_vars(desc_text),
-   }
-
-   yaml.default_flow_style = False
-   yaml.sort_base_mapping_type_on_output = False
-   yaml.dump(output, sys.stdout)
-
-
- def get_profile(job_yaml: dict) -> str:
-   containers: list[dict] = (
-       job_yaml.get('spec', {})
-       .get('template', {})
-       .get('spec', {})
-       .get('containers', [])
-   )
-   env_vars = next(iter(containers), {}).get('env', [])
-   profile = next((x['value'] for x in env_vars if x['name'] == 'PROFILE'), '')
-   return profile
-
-
- def get_mounts(job_yaml: dict) -> list[dict]:
-   containers: list[dict] = (
-       job_yaml.get('spec', {})
-       .get('template', {})
-       .get('spec', {})
-       .get('containers', [])
-   )
-   mounts: list[dict] = next(iter(containers), {}).get('volumeMounts', [])
-   return mounts
-
-
- def get_kjob_env_vars(job_desc_text: str) -> list[tuple[str, str]]:
-   regex = r'(SLURM_[A-Z_]*=.*)'
-   search_res = re.findall(regex, job_desc_text)
-   return search_res
-
-
- def get_pods(pods_text: str) -> list[dict[str, str]]:
-   pods_lines = pods_text.strip().split('\n')
-   pods_lines_tokenized = [line.split() for line in pods_lines]
-   return [
-       {
-           'Name': tokens[0],
-           'Status': tokens[2],
-       }
-       for tokens in pods_lines_tokenized
-   ]
-
-
- def get_script_name(job_yaml: dict) -> str | None:
-   return cast(
-       str | None,
-       job_yaml.get('metadata', {})
-       .get('annotations', {})
-       .get('kjobctl.x-k8s.io/script', ''),
-   )
-
-
- def job_list(args) -> None:
-   """Function around job list.
-
-   Args:
-     args: user provided arguments for running the command.
-
-   Returns:
-     None
-   """
-   if should_validate_dependencies(args):
-     validate_dependencies_list([
-         SystemDependency.KUBECTL,
-         SystemDependency.KJOB,
-         SystemDependency.GCLOUD,
-     ])
-   if not args.kind_cluster:
-     add_zone_and_project(args)
-     get_cluster_credentials(args)
-     msg = f'Listing jobs for project {args.project} and zone {args.zone}:'
-   else:
-     set_cluster_command_code = set_local_cluster_command(args)
-     msg = 'Listing jobs:'
-     if set_cluster_command_code != 0:
-       xpk_exit(set_cluster_command_code)
-
-   xpk_print(msg, flush=True)
-
-   return_code = run_slurm_job_list_command()
-   xpk_exit(return_code)
-
-
- def run_slurm_job_list_command() -> int:
-   cmd = f'kubectl-kjob list slurm --profile {AppProfileDefaults.NAME.value}'
-
-   return_code = run_command_with_updates(cmd, 'list jobs')
-   if return_code != 0:
-     xpk_print(f'Listing jobs returned ERROR {return_code}')
-   return return_code
-
-
- def job_cancel(args) -> None:
-   """Function around job cancel.
-
-   Args:
-     args: user provided arguments for running the command.
-
-   Returns:
-     None
-   """
-   if should_validate_dependencies(args):
-     validate_dependencies_list([
-         SystemDependency.KUBECTL,
-         SystemDependency.KJOB,
-         SystemDependency.GCLOUD,
-     ])
-
-   xpk_print(f'Starting job cancel for job: {args.name}', flush=True)
-   if not args.kind_cluster:
-     add_zone_and_project(args)
-     get_cluster_credentials(args)
-   else:
-     set_cluster_command_code = set_local_cluster_command(args)
-     if set_cluster_command_code != 0:
-       xpk_exit(set_cluster_command_code)
-
-   return_code = run_slurm_job_delete_command(args)
-   xpk_exit(return_code)
-
-
- def run_slurm_job_delete_command(args) -> int:
-   list_of_jobs = ' '.join(args.name)
-   cmd = f'kubectl-kjob delete slurm {list_of_jobs}'
-
-   return_code = run_command_with_updates(cmd, 'delete job')
-   if return_code != 0:
-     xpk_print(f'Delete job request returned ERROR {return_code}')
-   return return_code
xpk/commands/kjob_common.py DELETED
@@ -1,60 +0,0 @@
- """
- Copyright 2025 Google LLC
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- https://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- from ..core.capacity import (
-     B200_DEVICE_TYPE,
-     H100_MEGA_DEVICE_TYPE,
-     H200_DEVICE_TYPE,
- )
- from ..core.cluster import get_gpu_type_from_cluster
- from ..core.kjob import (
-     get_a3mega_pod_template_annotations,
-     get_a3ultra_pod_template_annotations,
-     get_a4_pod_template_annotations,
-     Kueue_TAS_annotation,
- )
- from .common import is_TAS_possible
- from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
-
-
- def add_gpu_networking_annotations_to_command(args, cmd: str) -> str:
-   gpu_type = get_gpu_type_from_cluster(args)
-
-   annotations: tuple
-   if gpu_type == H100_MEGA_DEVICE_TYPE:
-     annotations = get_a3mega_pod_template_annotations()
-   elif gpu_type == H200_DEVICE_TYPE:
-     annotations = get_a3ultra_pod_template_annotations()
-   elif gpu_type == B200_DEVICE_TYPE:
-     annotations = get_a4_pod_template_annotations()
-   else:
-     annotations = tuple()
-
-   flags = [
-       f" --pod-template-annotation {annotation} " for annotation in annotations
-   ]
-   cmd += "\\\n".join(flags)
-
-   return cmd
-
-
- def add_TAS_annotations_to_command(args, cmd: str) -> str:
-   system_characteristics = get_cluster_system_characteristics(args)
-   capacity_type = get_cluster_capacity_type(args)
-   if is_TAS_possible(system_characteristics, capacity_type):
-     cmd += f" --pod-template-annotation {Kueue_TAS_annotation}"
-
-   return cmd
xpk/commands/run.py DELETED
@@ -1,140 +0,0 @@
- """
- Copyright 2025 Google LLC
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- https://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- from argparse import Namespace
-
- from ..core.cluster import (
-     setup_k8s_service_accounts,
-     get_cluster_credentials,
- )
- from ..core.commands import run_command_with_full_controls
- from ..core.gcloud_context import add_zone_and_project
- from ..core.kjob import (
-     AppProfileDefaults,
-     JobTemplateDefaults,
-     get_storage_annotations,
-     prepare_kjob,
- )
- from ..core.kueue_manager import LOCAL_QUEUE_NAME
- from ..utils.console import xpk_exit, xpk_print
- from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
- from .kind import set_local_cluster_command
- from .kjob_common import add_gpu_networking_annotations_to_command, add_TAS_annotations_to_command
-
-
- def run(args: Namespace) -> None:
-   """Run task.
-   This function runs passed script in non-blocking manner.
-   Args:
-     args: user provided arguments for running the command.
-   Returns:
-     None
-   """
-   if should_validate_dependencies(args):
-     validate_dependencies_list([
-         SystemDependency.KUBECTL,
-         SystemDependency.KJOB,
-         SystemDependency.GCLOUD,
-     ])
-   if not args.kind_cluster:
-     add_zone_and_project(args)
-     get_cluster_credentials(args)
-   else:
-     set_cluster_command_code = set_local_cluster_command(args)
-     if set_cluster_command_code != 0:
-       xpk_exit(set_cluster_command_code)
-
-   err_code = prepare_kjob(args)
-   if err_code > 0:
-     xpk_exit(err_code)
-   setup_k8s_service_accounts()
-
-   submit_job(args)
-
-
- def submit_job(args: Namespace) -> None:
-   cmd = (
-       'kubectl kjob create slurm --profile'
-       f' {AppProfileDefaults.NAME.value} '
-       f' --localqueue {LOCAL_QUEUE_NAME} '
-       f' --stream-container {JobTemplateDefaults.CONTAINER_NAME.value}'
-       f' --worker-container {JobTemplateDefaults.CONTAINER_NAME.value}'
-       ' --wait --rm --first-node-ip'
-   )
-   cmd = add_gpu_networking_annotations_to_command(args, cmd)
-   cmd = add_TAS_annotations_to_command(args, cmd)
-
-   for annotation in get_storage_annotations(args):
-     cmd += f' --pod-template-annotation {annotation}'
-
-   if args.timeout:
-     cmd += f' --wait-timeout {args.timeout}s'
-
-   if args.ignore_unknown_flags:
-     cmd += ' --ignore-unknown-flags'
-
-   cmd += f' -- {args.script} --partition {LOCAL_QUEUE_NAME}'
-
-   if args.array is not None:
-     cmd += f' --array {args.array}'
-
-   if args.cpus_per_task is not None:
-     cmd += f' --cpus-per-task {args.cpus_per_task}'
-
-   if args.gpus_per_task is not None:
-     cmd += f' --gpus-per-task {args.gpus_per_task}'
-
-   if args.mem is not None:
-     cmd += f' --mem {args.mem}'
-
-   if args.mem_per_task is not None:
-     cmd += f' --mem-per-task {args.mem_per_task}'
-
-   if args.mem_per_cpu is not None:
-     cmd += f' --mem-per-cpu {args.mem_per_cpu}'
-
-   if args.mem_per_gpu is not None:
-     cmd += f' --mem-per-gpu {args.mem_per_gpu}'
-
-   if args.nodes is not None:
-     cmd += f' --nodes {args.nodes}'
-
-   if args.ntasks is not None:
-     cmd += f' --ntasks {args.ntasks}'
-
-   if args.output is not None:
-     cmd += f' --output {args.output}'
-
-   if args.error is not None:
-     cmd += f' --error {args.error}'
-
-   if args.input is not None:
-     cmd += f' --input {args.input}'
-
-   if args.job_name is not None:
-     cmd += f' --job-name {args.job_name}'
-
-   if args.chdir is not None:
-     cmd += f' --chdir {args.chdir}'
-
-   if args.time is not None:
-     cmd += f' --time {args.time}'
-
-   return_code = run_command_with_full_controls(cmd, 'run task')
-
-   if return_code != 0:
-     xpk_print(f'Running task returned ERROR {return_code}')
-     xpk_exit(return_code)