xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,190 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import yaml
18
+ from ...utils.yaml import literal_string
19
+
20
+ # Image tag of the tcpgpudmarxd-dev image run by the tcpxo-daemon container.
21
+ rxdm = 'v1.0.12'
22
+
23
+
24
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Decorate a kjob job template with the tcpxo-daemon components.

  Missing `spec.template.spec` levels and its `tolerations` / `volumes` lists
  are created first, then volumes, tolerations, the tcpxo-daemon container and
  the GPU container settings are added. Unlike `decorate_job`, no Pod
  annotations are written.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.

  Returns:
    The same `job_manifest` object.
  """
  job_spec = job_manifest.setdefault('spec', {})
  pod_template = job_spec.setdefault('template', {})
  pod_spec = pod_template.setdefault('spec', {})
  for list_field in ('tolerations', 'volumes'):
    pod_spec.setdefault(list_field, [])

  # Order matters: the daemon container is appended before the GPU containers
  # are scanned, exactly as in the step-by-step sequence this replaces.
  decoration_steps = (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  )
  for apply_step in decoration_steps:
    apply_step(job_manifest)
  return job_manifest
38
+
39
+
40
def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
  """Decorate a Job manifest with the tcpxo-daemon components.

  Missing `spec.template.metadata.annotations`, `spec.template.spec` and its
  `tolerations` / `volumes` lists are created first. Annotations, volumes,
  tolerations, the tcpxo-daemon container and the GPU container settings are
  then added.

  Args:
    job_manifest: The Job manifest as a dict; it is modified in place.
    sub_networks: Network names used to build the interfaces annotation.

  Returns:
    The same `job_manifest` object.
  """
  pod_template = job_manifest.setdefault('spec', {}).setdefault('template', {})
  # Metadata is created before the Pod spec so key order matches a freshly
  # built template.
  pod_template.setdefault('metadata', {}).setdefault('annotations', {})
  pod_spec = pod_template.setdefault('spec', {})
  for list_field in ('tolerations', 'volumes'):
    pod_spec.setdefault(list_field, [])

  add_annotations(job_manifest, sub_networks)
  for apply_step in (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  ):
    apply_step(job_manifest)
  return job_manifest
58
+
59
+
60
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
  """Decorate every replicated job of a JobSet manifest for tcpxo-daemon.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.
    sub_networks: Network names forwarded to `decorate_job` for each
      replicated job template.

  Returns:
    The modified JobSet manifest as a YAML string, with key order preserved.
  """
  jobset = yaml.safe_load(jobset_manifest_str)

  for replicated_job in jobset['spec']['replicatedJobs']:
    # decorate_job mutates the template dict in place, so its return value
    # does not need to be stored back into the JobSet.
    decorate_job(replicated_job['template'], sub_networks)

  return yaml.dump(jobset, sort_keys=False)
77
+
78
+
79
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Build the `networking.gke.io/interfaces` annotation entry.

  The value lists `eth0` on the `default` network followed by `eth1`..`eth8`,
  each bound to the first eight names in `sub_networks`.

  Args:
    sub_networks: Network names; the first eight entries are used.

  Returns:
    A `(annotation key, annotation value)` pair, where the value is the
    bracketed interface list wrapped with `literal_string`.

  Raises:
    IndexError: If `sub_networks` has fewer than eight entries.
  """
  entries = [' {"interfaceName":"eth0","network":"default"}']
  for nic_index in range(8):
    network = sub_networks[nic_index]
    entries.append(
        f' {{"interfaceName":"eth{nic_index + 1}","network":"{network}"}}'
    )

  # Entries are comma-separated, one per line, with no trailing comma.
  body = ',\n'.join(entries)
  return 'networking.gke.io/interfaces', literal_string(f'[\n{body}\n]')
90
+
91
+
92
def get_tcpxo_deamon_entry() -> tuple[str, str]:
  """Build the `devices.gke.io/container.tcpxo-daemon` annotation entry.

  Returns:
    A `(annotation key, annotation value)` pair. The value is a list of
    `- path: <device>` lines, one per device and each newline-terminated,
    wrapped with `literal_string`.
  """
  device_paths = [f'/dev/nvidia{gpu_index}' for gpu_index in range(8)]
  device_paths += [
      '/dev/nvidiactl',
      '/dev/nvidia-uvm',
      '/dev/dmabuf_import_helper',
  ]
  device_list = ''.join(f'- path: {path}\n' for path in device_paths)
  return 'devices.gke.io/container.tcpxo-daemon', literal_string(device_list)
106
+
107
+
108
def add_annotations(job_manifest, sub_networks):
  """Add or overwrite the tcpxo annotations on the Pod template.

  Sets the tcpxo-daemon device list, the default interface (`eth0`) and the
  interfaces list. `spec.template.metadata.annotations` must already exist.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.
    sub_networks: Network names passed to `get_interfaces_entry`.
  """
  pod_annotations = job_manifest['spec']['template']['metadata']['annotations']
  daemon_key, daemon_devices = get_tcpxo_deamon_entry()
  nic_key, nic_value = get_interfaces_entry(sub_networks)

  pod_annotations[daemon_key] = daemon_devices
  pod_annotations['networking.gke.io/default-interface'] = 'eth0'
  pod_annotations[nic_key] = nic_value
118
+
119
+
120
def add_tolerations(job_manifest):
  """Append the `user-workload=true:NoSchedule` toleration to the Pod spec.

  `spec.template.spec.tolerations` must already exist. The toleration is
  appended unconditionally, without checking for an existing equal entry.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.
  """
  pod_spec = job_manifest['spec']['template']['spec']
  user_workload_toleration = dict(
      key='user-workload',
      operator='Equal',
      value='true',
      effect='NoSchedule',
  )
  pod_spec['tolerations'].append(user_workload_toleration)
129
+
130
+
131
def add_volumes(job_manifest):
  """Append the hostPath volumes used by the tcpxo setup to the Pod spec.

  Adds the `libraries`, `sys`, `proc-sys` and `aperture-devices` volumes, in
  that order. `spec.template.spec.volumes` must already exist.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.
  """
  host_paths = (
      ('libraries', '/home/kubernetes/bin/nvidia'),
      ('sys', '/sys'),
      ('proc-sys', '/proc/sys'),
      ('aperture-devices', '/dev/aperture_devices'),
  )
  pod_volumes = job_manifest['spec']['template']['spec']['volumes']
  for volume_name, host_path in host_paths:
    pod_volumes.append({'name': volume_name, 'hostPath': {'path': host_path}})
144
+
145
+
146
def add_tcpxo_daemon_container(job_manifest):
  """Append the tcpxo-daemon container to the Pod spec's containers.

  The container runs the `tcpgpudmarxd-dev` image at the tag given by the
  module-level `rxdm` constant and starts `/fts/entrypoint_rxdm_container.sh`.
  `spec.template.spec.containers` must already exist.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.
  """
  image = (
      f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}'
  )
  # Shell script passed as the single argument to `/bin/sh -c`.
  startup_script = '\n'.join([
      'set -ex',
      'chmod 755 /fts/entrypoint_rxdm_container.sh',
      (
          '/fts/entrypoint_rxdm_container.sh'
          ' --num_hops=2 --num_nics=8 --uid= --alsologtostderr'
      ),
  ])
  mounts = [
      {'name': volume_name, 'mountPath': mount_path}
      for volume_name, mount_path in (
          ('libraries', '/usr/local/nvidia'),
          ('sys', '/hostsysfs'),
          ('proc-sys', '/hostprocsysfs'),
      )
  ]
  daemon = {
      'name': 'tcpxo-daemon',
      'image': image,
      'imagePullPolicy': 'Always',
      'command': ['/bin/sh', '-c'],
      'args': [startup_script],
      'securityContext': {
          'capabilities': {'add': ['NET_ADMIN', 'NET_BIND_SERVICE']}
      },
      'volumeMounts': mounts,
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }

  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec['containers'].append(daemon)
171
+
172
+
173
def update_gpu_containers(job_manifest):
  """Add tcpxo env vars and volume mounts to every GPU container.

  A container counts as a GPU container when `nvidia.com/gpu` appears in its
  `resources.limits`. Each such container gets the `LD_LIBRARY_PATH` and
  `NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY` env vars and the `aperture-devices` and
  `libraries` volume mounts appended. Other containers are left untouched.

  Args:
    job_manifest: The job manifest as a dict; it is modified in place.
  """
  pod_spec = job_manifest['spec']['template']['spec']
  for gpu_candidate in pod_spec['containers']:
    limits = gpu_candidate.get('resources', {}).get('limits', {})
    if 'nvidia.com/gpu' not in limits:
      continue

    # Fresh dicts are built per container so containers never share entries.
    gpu_candidate.setdefault('env', []).extend([
        {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'},
        {
            'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',
            'value': '/dev/aperture_devices',
        },
    ])
    gpu_candidate.setdefault('volumeMounts', []).extend([
        {'name': 'aperture-devices', 'mountPath': '/dev/aperture_devices'},
        {'name': 'libraries', 'mountPath': '/usr/local/nvidia'},
    ])
xpk/main.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Copyright 2023 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ r"""xpk (Accelerated Processing Kit).
18
+
19
+ Next Steps:
20
+ - Cluster describe is broken by Cacheimage since that counts as a workload.
21
+ - Cluster describe: count by jobset.
22
+ - If any instance goes down, bring down the whole job.
23
+ - How to more gracefully handle job failures, distinguishing between software
24
+ and infra?
25
+ - Look into --docker-name and --docker-image.
26
+ Shouldn't one string be adequate to express what we want?
27
+ - Apply learnings about private, region, coredns, etc.:
28
+ - Enable special preheater
29
+ - Make this Argparse logic a function?
30
+ - Obvious logic that starts in main instead of here in code but args will
31
+ not be a universal argument.
32
+ """
33
+
34
+ import argparse
35
+ import sys
36
+
37
+ from .parser.core import set_parser
38
+ from .utils.console import xpk_print
39
+ from .utils.validation import validate_dependencies
40
+ ################### Compatibility Check ###################
41
+ # Check that the user runs the below version or greater.
42
+
43
+
44
+ major_version_supported = 3
45
+ minor_version_supported = 10
46
+
47
+ user_major_version = sys.version_info[0]
48
+ user_minor_version = sys.version_info[1]
49
+ if (
50
+ user_major_version < major_version_supported
51
+ or user_minor_version < minor_version_supported
52
+ ):
53
+ raise RuntimeError(
54
+ 'xpk must be run with Python'
55
+ f' {major_version_supported}.{minor_version_supported} or greater.'
56
+ f' User currently is running {user_major_version}.{user_minor_version}'
57
+ )
58
+
59
# Build the top-level `xpk` parser and hand it to set_parser for configuration.
parser = argparse.ArgumentParser(prog='xpk', description='xpk command')
set_parser(parser=parser)

# NOTE: everything below runs when this module is imported, not inside main().
xpk_print('Starting xpk', flush=True)
validate_dependencies()

# Parse the command line, then dispatch to the handler stored on the namespace
# as `func`.
main_args = parser.parse_args()
main_args.enable_ray_cluster = False
main_args.func(main_args)
68
+
69
+
70
def main() -> None:
  """Print the completion message.

  Argument parsing and command dispatch happen in this module's top-level
  code at import time, so by the time `main` is called the selected command
  has already run.
  """
  xpk_print('XPK Done.', flush=True)


if __name__ == '__main__':
  main()
xpk/parser/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
xpk/parser/batch.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from .common import (
18
+ add_shared_arguments,
19
+ add_slurm_arguments,
20
+ add_cluster_arguments,
21
+ add_kind_cluster_arguments,
22
+ )
23
+ from ..commands.batch import batch
24
+
25
+
26
def set_batch_parser(batch_parser):
  """Configure the argument parser for the `batch` command.

  Registers the required positional `script` argument, attaches the cluster,
  kind-cluster, shared and slurm argument sets to the optional group, and
  sets `batch` as the parser's default `func`.

  Args:
    batch_parser: The parser for the `batch` command; it is modified in place.
  """
  required_group = batch_parser.add_argument_group(
      'batch Built-in Arguments', 'Arguments required for `batch`.'
  )
  optional_group = batch_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for `batch`.'
  )

  ### "batch" Required arguments
  required_group.add_argument('script', help='script with batch task to run')

  ### "batch" Optional arguments
  argument_adders = (
      add_cluster_arguments,
      add_kind_cluster_arguments,
      add_shared_arguments,
      add_slurm_arguments,
  )
  for add_arguments in argument_adders:
    add_arguments(optional_group)

  batch_parser.set_defaults(func=batch)