xpk 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/METADATA +301 -28
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.5.0.dist-info/RECORD +0 -7
  57. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7282
  59. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.5.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
xpk/core/workload.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .commands import run_command_for_value
18
+
19
+
20
def workload_list_awk_command(filter_key) -> str:
  """Build the awk pipe fragment that filters `kubectl get workloads` output.

  Args:
    filter_key: awk boolean expression selecting which rows to keep.

  Returns:
    A shell pipe suffix applying the filter while preserving the header row.
  """
  # NR == 1 keeps the column-header line regardless of the filter.
  return " | awk -e 'NR == 1 || " + filter_key + " {print $0}'"
31
+
32
+
33
def determine_workload_list_filter_by_status(args) -> str:
  """Build the awk filter that narrows workload list to the requested status.

  Args:
    args: user provided arguments for running the command.

  Returns:
    the argument needed to filter by status of jobs in workload list.

  Raises:
    RuntimeError: if args.filter_by_status is not a recognized filter name.
  """
  # awk column positions in the output of the workload list command.
  status_col = '$7'
  running_vms_col = '$5'
  status_verbose_col = '$9'

  # Map each filter name to its awk expression; None means "no filtering".
  awk_expressions = {
      'EVERYTHING': None,
      # Running: status Admitted or Evicted with a positive running-vm count.
      'RUNNING': (
          f'({status_col} ~ "Admitted|Evicted" && {running_vms_col} ~'
          f' /^[0-9]+$/ && {running_vms_col} > 0)'
      ),
      # Queued: Admitted/Evicted/QuotaReserved with zero vms running.
      'QUEUED': (
          f'({status_col} ~ "Admitted|Evicted|QuotaReserved" &&'
          f' ({running_vms_col} ~ "<none>" || {running_vms_col} == 0))'
      ),
      'FINISHED': f'{status_col} == "Finished"',
      # Failed: status Finished and the verbose reason mentions "failed".
      'FAILED': (
          f'({status_col} == "Finished" && {status_verbose_col} ~ "failed")'
      ),
      # Successful: status Finished and the verbose reason mentions "finished".
      'SUCCESSFUL': (
          f'({status_col} == "Finished" && {status_verbose_col} ~ "finished")'
      ),
  }

  if args.filter_by_status not in awk_expressions:
    raise RuntimeError(f'Can not find filter type: {args.filter_by_status}')
  expression = awk_expressions[args.filter_by_status]
  return '' if expression is None else workload_list_awk_command(expression)
76
+
77
+
78
def determine_workload_list_filter_by_job(args) -> str:
  """Build the awk filter that narrows workload list to a given job name.

  Args:
    args: user provided arguments for running the command.

  Returns:
    the argument needed to filter job names from workload list
  """
  # filter_by_job is optional: some callers never define it on args.
  job_filter = getattr(args, 'filter_by_job', None)
  if job_filter is None:
    return ''
  # $1 is the Jobset Name column produced by the workload list command.
  job_name_col = '$1'
  return workload_list_awk_command(f'{job_name_col} ~ "{job_filter}"')
93
+
94
+
95
def get_workload_list(args) -> tuple[int, str]:
  """List the kueue workloads in the cluster matching the user's filters.

  Args:
    args: user provided arguments for running the command.

  Returns:
    return_code: 0 if successful and 1 otherwise.
    return_value: workloads in the cluster matching the criteria.
  """
  # Display column name -> JSONPath into the kueue Workload object.
  columns = {
      'Jobset Name': '.metadata.ownerReferences[0].name',
      'Created Time': '.metadata.creationTimestamp',
      'Priority': '.spec.priorityClassName',
      'TPU VMs Needed': '.spec.podSets[0].count',
      'TPU VMs Running/Ran': '.status.admission.podSetAssignments[-1].count',
      'TPU VMs Done': '.status.reclaimablePods[0].count',
      'Status': '.status.conditions[-1].type',
      'Status Message': '.status.conditions[-1].message',
      'Status Time': '.status.conditions[-1].lastTransitionTime',
  }
  custom_columns = ','.join(f'{name}:{path}' for name, path in columns.items())

  status_filter = determine_workload_list_filter_by_status(args)
  job_filter = determine_workload_list_filter_by_job(args)
  command = (
      f'kubectl get workloads -o=custom-columns="{custom_columns}" '
      f'{status_filter} {job_filter}'
  )

  task = f'List Jobs with filter-by-status={args.filter_by_status}'
  if hasattr(args, 'filter_by_job'):
    task += f' with filter-by-job={args.filter_by_job}'

  return run_command_for_value(command, task, args)
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
@@ -0,0 +1,109 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import yaml
18
+ from ...utils.yaml import literal_string
19
+
20
+
21
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
  """
  Decorates a JobSet manifest with the necessary components for rdma-daemon.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.
    sub_networks: Per-interface network names for the extra NICs.

  Returns:
    The modified JobSet manifest as a YAML string.
  """
  manifest = yaml.safe_load(jobset_manifest_str)

  for replicated_job in manifest['spec']['replicatedJobs']:
    job_manifest = replicated_job['template']
    # Make sure the nested pod-template sections exist before decorating.
    pod_template = job_manifest.setdefault('spec', {}).setdefault(
        'template', {}
    )
    pod_template.setdefault('metadata', {}).setdefault('annotations', {})
    pod_spec = pod_template.setdefault('spec', {})
    pod_spec.setdefault('tolerations', [])
    pod_spec.setdefault('volumes', [])

    add_annotations(job_manifest, sub_networks)
    add_volumes(job_manifest)
    add_tolerations(job_manifest)
    update_gpu_containers(job_manifest)

  return yaml.dump(manifest, sort_keys=False)
53
+
54
+
55
def add_annotations(job_manifest, sub_networks):
  """Adds or updates the multi-NIC networking annotations on the Pod template."""
  annotations = job_manifest['spec']['template']['metadata']['annotations']
  # eth0 stays on the default network; eth1..eth9 map onto the sub-networks.
  interface_lines = ['[', ' {"interfaceName":"eth0","network":"default"},']
  for idx in range(9):
    trailing_comma = ',' if idx < 8 else ''
    interface_lines.append(
        f' {{"interfaceName":"eth{idx + 1}","network":"{sub_networks[idx]}"}}'
        + trailing_comma
    )
  interface_lines.append(']')
  annotations['networking.gke.io/default-interface'] = 'eth0'
  annotations['networking.gke.io/interfaces'] = literal_string(
      '\n'.join(interface_lines)
  )
71
+
72
+
73
def add_volumes(job_manifest):
  """Adds the host-path volumes required by the rdma setup to the Pod spec."""
  volumes = job_manifest['spec']['template']['spec']['volumes']
  volumes.extend([
      {
          'name': 'library-dir-host',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia'},
      },
      {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}},
  ])
83
+
84
+
85
def add_tolerations(job_manifest):
  """Adds the user-workload toleration to the Pod spec."""
  user_workload_toleration = {
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }
  job_manifest['spec']['template']['spec']['tolerations'].append(
      user_workload_toleration
  )
94
+
95
+
96
def update_gpu_containers(job_manifest):
  """Points every GPU container at the host NVIDIA and gib libraries."""
  for container in job_manifest['spec']['template']['spec']['containers']:
    limits = container.get('resources', {}).get('limits', {})
    # Only containers that actually request GPUs need the library mounts.
    if 'nvidia.com/gpu' not in limits:
      continue
    env = container.setdefault('env', [])
    env.append({'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'})
    mounts = container.setdefault('volumeMounts', [])
    mounts.append({'name': 'library-dir-host', 'mountPath': '/usr/local/nvidia'})
    mounts.append({'name': 'gib', 'mountPath': '/usr/local/gib'})
@@ -0,0 +1,157 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import yaml
18
+ from ...utils.yaml import literal_string
19
+
20
+ # Component version
21
+ rxdm = 'v1.0.12'
22
+
23
+
24
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
  """
  Decorates a JobSet manifest with the necessary components for tcpxo-daemon.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.
    sub_networks: Per-interface network names for the extra NICs.

  Returns:
    The modified JobSet manifest as a YAML string.
  """
  manifest = yaml.safe_load(jobset_manifest_str)

  for replicated_job in manifest['spec']['replicatedJobs']:
    job_manifest = replicated_job['template']
    # Make sure the nested pod-template sections exist before decorating.
    pod_template = job_manifest.setdefault('spec', {}).setdefault(
        'template', {}
    )
    pod_template.setdefault('metadata', {}).setdefault('annotations', {})
    pod_spec = pod_template.setdefault('spec', {})
    pod_spec.setdefault('tolerations', [])
    pod_spec.setdefault('volumes', [])

    add_annotations(job_manifest, sub_networks)
    add_volumes(job_manifest)
    add_tolerations(job_manifest)
    add_tcpxo_daemon_container(job_manifest)
    update_gpu_containers(job_manifest)

  return yaml.dump(manifest, sort_keys=False)
57
+
58
+
59
def add_annotations(job_manifest, sub_networks):
  """Adds or updates the tcpxo device and networking annotations."""
  annotations = job_manifest['spec']['template']['metadata']['annotations']
  # eth0 stays on the default network; eth1..eth8 map onto the sub-networks.
  interface_lines = ['[', ' {"interfaceName":"eth0","network":"default"},']
  for idx in range(8):
    trailing_comma = ',' if idx < 7 else ''
    interface_lines.append(
        f' {{"interfaceName":"eth{idx + 1}","network":"{sub_networks[idx]}"}}'
        + trailing_comma
    )
  interface_lines.append(']')
  # Device nodes the tcpxo-daemon container needs exposed.
  device_paths = [f'/dev/nvidia{i}' for i in range(8)] + [
      '/dev/nvidiactl',
      '/dev/nvidia-uvm',
      '/dev/dmabuf_import_helper',
  ]
  annotations['devices.gke.io/container.tcpxo-daemon'] = literal_string(
      ''.join(f'- path: {path}\n' for path in device_paths)
  )
  annotations['networking.gke.io/default-interface'] = 'eth0'
  annotations['networking.gke.io/interfaces'] = literal_string(
      '\n'.join(interface_lines)
  )
88
+
89
+
90
def add_tolerations(job_manifest):
  """Adds the user-workload toleration to the Pod spec."""
  user_workload_toleration = {
      'key': 'user-workload',
      'operator': 'Equal',
      'value': 'true',
      'effect': 'NoSchedule',
  }
  job_manifest['spec']['template']['spec']['tolerations'].append(
      user_workload_toleration
  )
99
+
100
+
101
def add_volumes(job_manifest):
  """Adds the host-path volumes required by tcpxo to the Pod spec."""
  volumes = job_manifest['spec']['template']['spec']['volumes']
  volumes.extend([
      {
          'name': 'libraries',
          'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
      },
      {'name': 'sys', 'hostPath': {'path': '/sys'}},
      {'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}},
      {
          'name': 'aperture-devices',
          'hostPath': {'path': '/dev/aperture_devices'},
      },
  ])
114
+
115
+
116
def add_tcpxo_daemon_container(job_manifest):
  """Prepends the tcpxo-daemon sidecar container to the Pod spec."""
  # rxdm is the module-level component version pin for the daemon image.
  image = (
      'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/'
      f'tcpgpudmarxd-dev:{rxdm}'
  )
  daemon = {
      'name': 'tcpxo-daemon',
      'image': image,
      'imagePullPolicy': 'Always',
      'command': ['/bin/sh', '-c'],
      'args': [
          'set -ex\nchmod 755'
          ' /fts/entrypoint_rxdm_container.sh\n/fts/entrypoint_rxdm_container.sh'
          ' --num_hops=2 --num_nics=8 --uid= --alsologtostderr'
      ],
      'securityContext': {
          'capabilities': {'add': ['NET_ADMIN', 'NET_BIND_SERVICE']}
      },
      'volumeMounts': [
          {'name': 'libraries', 'mountPath': '/usr/local/nvidia'},
          {'name': 'sys', 'mountPath': '/hostsysfs'},
          {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
      ],
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  # Insert at position 0 so the daemon runs alongside the workload containers.
  job_manifest['spec']['template']['spec']['containers'].insert(0, daemon)
141
+
142
+
143
def update_gpu_containers(job_manifest):
  """Wires every GPU container up to the NVIDIA libs and aperture devices."""
  for container in job_manifest['spec']['template']['spec']['containers']:
    limits = container.get('resources', {}).get('limits', {})
    # Only containers that actually request GPUs need the FasTrak plumbing.
    if 'nvidia.com/gpu' not in limits:
      continue
    env = container.setdefault('env', [])
    env.append({'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'})
    env.append({
        'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',
        'value': '/dev/aperture_devices',
    })
    mounts = container.setdefault('volumeMounts', [])
    mounts.append(
        {'name': 'aperture-devices', 'mountPath': '/dev/aperture_devices'}
    )
xpk/main.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Copyright 2023 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ r"""xpk (Accelerated Processing Kit).
18
+
19
+ Next Steps:
20
+ - Cluster describe is broken by Cacheimage since that counts as a workload.
21
+ - Cluster describe: count by jobset.
22
+ - If any instance goes down, bring down the whole job.
23
+ - How to more gracefully handle job failures, distinguishing between software
24
+ and infra?
25
+ - Look into --docker-name and --docker-image.
26
+ Shouldn't one string be adequate to express what we want?
27
+ - Apply learnings from about private, region, coredns, etc:
28
+ - Enable special preheater
29
+ - Make Argparse logic this a function?
30
+ - Obvious logic that starts in main instead of here in code but args will
31
+ not be a universal argument.
32
+ """
33
+
34
+ import argparse
35
+ import sys
36
+
37
+ from .parser.core import set_parser
38
+ from .utils.console import xpk_print
39
+
40
################### Compatibility Check ###################
# Check that the user runs the below version or greater.

major_version_supported = 3
minor_version_supported = 10

user_major_version = sys.version_info[0]
user_minor_version = sys.version_info[1]
# BUGFIX: compare (major, minor) as a tuple. The previous element-wise test
# (`major < 3 or minor < 10`) would wrongly reject any future Python whose
# minor version is below 10, e.g. 4.0.
if (user_major_version, user_minor_version) < (
    major_version_supported,
    minor_version_supported,
):
  raise RuntimeError(
      'xpk must be run with Python'
      f' {major_version_supported}.{minor_version_supported} or greater.'
      f' User currently is running {user_major_version}.{user_minor_version}'
  )

# Create top level parser for xpk command.
parser = argparse.ArgumentParser(description='xpk command', prog='xpk')
set_parser(parser=parser)

# NOTE(review): argument parsing and command dispatch run here at module
# import time; the `main()` entry point below only prints a completion
# message afterwards. Confirm this is intentional before restructuring.
xpk_print('Starting xpk', flush=True)
main_args = parser.parse_args()
main_args.enable_ray_cluster = False
main_args.func(main_args)
66
+
67
+
68
+ def main() -> None:
69
+ xpk_print('XPK Done.', flush=True)
70
+
71
+
72
+ if __name__ == '__main__':
73
+ main()
xpk/parser/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
xpk/parser/batch.py ADDED
@@ -0,0 +1,184 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import argparse
18
+
19
+ from .common import add_shared_arguments
20
+ from ..commands.batch import batch
21
+
22
+
23
def set_batch_parser(batch_parser):
  """Registers arguments and defaults for the `xpk batch` subcommand.

  Args:
    batch_parser: argparse sub-parser to populate. Its `func` default is
      bound to the `batch` command handler.
  """
  batch_required_arguments = batch_parser.add_argument_group(
      'batch Built-in Arguments', 'Arguments required for `batch`.'
  )
  batch_optional_arguments = batch_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for `batch`.'
  )

  ### "batch" Required arguments
  batch_required_arguments.add_argument(
      'script', help='script with batch task to run'
  )
  batch_optional_arguments.add_argument(
      '--cluster',
      type=str,
      default=None,
      help='Cluster to which command applies.',
  )
  # FIX: dropped `type=bool` — BooleanOptionalAction consumes no value so
  # `type` is ignored, and passing it is deprecated since Python 3.12.
  batch_optional_arguments.add_argument(
      '--kind-cluster',
      action=argparse.BooleanOptionalAction,
      default=False,
      help='Apply command to a local test cluster.',
  )
  add_shared_arguments(batch_optional_arguments)

  batch_parser.set_defaults(func=batch)

  # FIX: `type=bool` dropped here as well (see --kind-cluster above).
  batch_optional_arguments.add_argument(
      '--ignore-unknown-flags',
      action=argparse.BooleanOptionalAction,
      default=False,
      help='Ignore all the unsupported flags in the bash script.',
  )
  # The flags below mirror Slurm's sbatch options.
  # NOTE: '%%' in help text renders as a literal '%' after argparse's
  # %-formatting of help strings.
  batch_optional_arguments.add_argument(
      '-a',
      '--array',
      type=str,
      default=None,
      help=(
          'Submit a job array, multiple jobs to be executed with identical'
          ' parameters. The indexes specification identifies what array index'
          ' values should be used. For example, "--array=0-15" or'
          ' "--array=0,6,16-32". Multiple values may be specified using a comma'
          ' separated list and/or a range of values with a "-" separator. For'
          ' example "--array=0-15%%4" will limit the number of simultaneously'
          ' running tasks from this job array to 4. The minimum index value is'
          ' 0. The maximum index value is 2147483647.'
      ),
  )
  batch_optional_arguments.add_argument(
      '-c',
      '--cpus-per-task',
      type=str,
      default=None,
      help='How much cpus a container inside a pod requires.',
  )
  batch_optional_arguments.add_argument(
      '--gpus-per-task',
      type=str,
      default=None,
      help='How much gpus a container inside a pod requires.',
  )
  batch_optional_arguments.add_argument(
      '--mem',
      type=str,
      default=None,
      help='How much memory a pod requires.',
  )
  batch_optional_arguments.add_argument(
      '--mem-per-task',
      type=str,
      default=None,
      help='How much memory a container requires.',
  )
  batch_optional_arguments.add_argument(
      '--mem-per-cpu',
      type=str,
      default=None,
      help=(
          'How much memory a container requires, it multiplies the number '
          'of requested cpus per task by mem-per-cpu.'
      ),
  )
  batch_optional_arguments.add_argument(
      '--mem-per-gpu',
      type=str,
      default=None,
      help=(
          'How much memory a container requires, it multiplies the number '
          'of requested gpus per task by mem-per-gpu.'
      ),
  )
  batch_optional_arguments.add_argument(
      '-N',
      '--nodes',
      type=int,
      default=None,
      help='Number of pods to be used at a time.',
  )
  batch_optional_arguments.add_argument(
      '-n',
      '--ntasks',
      type=int,
      default=None,
      help='Number of identical containers inside of a pod, usually 1.',
  )
  batch_optional_arguments.add_argument(
      '-o',
      '--output',
      type=str,
      default=None,
      help=(
          'Where to redirect the standard output stream of a task. If not'
          ' passed it proceeds to stdout, and is available via kubectl logs.'
      ),
  )
  batch_optional_arguments.add_argument(
      '-e',
      '--error',
      type=str,
      default=None,
      help=(
          'Where to redirect std error stream of a task. If not passed it'
          ' proceeds to stdout, and is available via kubectl logs.'
      ),
  )
  batch_optional_arguments.add_argument(
      '--input',
      type=str,
      default=None,
      help='What to pipe into the script.',
  )
  batch_optional_arguments.add_argument(
      '-J',
      '--job-name',
      type=str,
      default=None,
      help='What is the job name.',
  )
  batch_optional_arguments.add_argument(
      '-D',
      '--chdir',
      type=str,
      default=None,
      help='Change directory before executing the script.',
  )
  batch_optional_arguments.add_argument(
      '-t',
      '--time',
      type=str,
      default=None,
      help=(
          'Set a limit on the total run time of the job. '
          'A time limit of zero requests that no time limit be imposed. '
          'Acceptable time formats include "minutes", "minutes:seconds", '
          '"hours:minutes:seconds", "days-hours", "days-hours:minutes" '
          'and "days-hours:minutes:seconds".'
      ),
  )