xpk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- xpk-0.0.1.dist-info/LICENSE +202 -0
- xpk-0.0.1.dist-info/METADATA +1498 -0
- xpk-0.0.1.dist-info/RECORD +92 -0
- xpk-0.0.1.dist-info/WHEEL +5 -0
- xpk-0.0.1.dist-info/entry_points.txt +2 -0
- xpk-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
from ...utils.yaml import literal_string
|
|
19
|
+
|
|
20
|
+
# Version tag of the tcpgpudmarxd-dev image used for the tcpxo-daemon container.
|
|
21
|
+
rxdm = 'v1.0.12'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def decorate_kjob_template(job_manifest: dict) -> dict:
  """Decorates a kjob job template with the tcpxo-daemon components.

  Makes sure the Pod spec has `tolerations` and `volumes` lists, then adds the
  volumes, the toleration, the tcpxo-daemon container and the GPU container
  settings. No Pod annotations are added here.

  Args:
    job_manifest: The job template as a dict. It is modified in place.

  Returns:
    The same `job_manifest` object, after decoration.
  """
  template = job_manifest.setdefault('spec', {}).setdefault('template', {})
  pod_spec = template.setdefault('spec', {})
  for list_field in ('tolerations', 'volumes'):
    pod_spec.setdefault(list_field, [])

  for decorate_step in (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  ):
    decorate_step(job_manifest)
  return job_manifest
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
  """Decorates a Job manifest with the tcpxo-daemon components.

  Creates the Pod template's `metadata.annotations`, `spec.tolerations` and
  `spec.volumes` containers when they are missing, then adds the annotations,
  volumes, toleration, tcpxo-daemon container and GPU container settings.

  Args:
    job_manifest: The Job manifest as a dict. It is modified in place.
    sub_networks: Sub-network names used to build the interfaces annotation.

  Returns:
    The same `job_manifest` object, after decoration.
  """
  template = job_manifest.setdefault('spec', {}).setdefault('template', {})
  # 'metadata' is created before 'spec' so key order matches on an empty
  # template.
  template.setdefault('metadata', {}).setdefault('annotations', {})
  pod_spec = template.setdefault('spec', {})
  pod_spec.setdefault('tolerations', [])
  pod_spec.setdefault('volumes', [])

  add_annotations(job_manifest, sub_networks)
  for decorate_step in (
      add_volumes,
      add_tolerations,
      add_tcpxo_daemon_container,
      update_gpu_containers,
  ):
    decorate_step(job_manifest)
  return job_manifest
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def decorate_jobset(jobset_manifest_str, sub_networks) -> str:
  """Adds the tcpxo-daemon components to every job of a JobSet manifest.

  Args:
    jobset_manifest_str: The JobSet manifest as a YAML string.
    sub_networks: Sub-network names, passed to `decorate_job` for each
      replicated job.

  Returns:
    The modified JobSet manifest as a YAML string.
  """
  jobset = yaml.safe_load(jobset_manifest_str)

  # decorate_job changes each template in place, so its return value is not
  # needed here.
  for replicated_job in jobset['spec']['replicatedJobs']:
    decorate_job(replicated_job['template'], sub_networks)
  return yaml.dump(jobset, sort_keys=False)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
  """Builds the `networking.gke.io/interfaces` annotation entry.

  The value is a bracketed, one-interface-per-line listing: `eth0` on the
  `default` network, followed by `eth1`..`eth8` on the first eight entries of
  `sub_networks`.

  Args:
    sub_networks: Network names for `eth1`..`eth8`. Only indices 0-7 are
      read; a list with fewer than eight entries raises IndexError.

  Returns:
    A `(key, value)` pair, where the value is wrapped with `literal_string`.
  """
  lines = ['[', ' {"interfaceName":"eth0","network":"default"},']
  for index in range(8):
    network = sub_networks[index]
    # Every entry except the last one is followed by a comma.
    separator = ',' if index < 7 else ''
    lines.append(
        f' {{"interfaceName":"eth{index + 1}","network":"{network}"}}{separator}'
    )
  lines.append(']')
  return 'networking.gke.io/interfaces', literal_string('\n'.join(lines))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_tcpxo_deamon_entry() -> tuple[str, str]:
  """Builds the `devices.gke.io/container.tcpxo-daemon` annotation entry.

  Returns:
    A `(key, value)` pair. The value lists one `- path: /dev/<device>` line
    for `nvidia0`..`nvidia7`, `nvidiactl`, `nvidia-uvm` and
    `dmabuf_import_helper`, wrapped with `literal_string`.
  """
  device_names = [f'nvidia{gpu_index}' for gpu_index in range(8)]
  device_names += ['nvidiactl', 'nvidia-uvm', 'dmabuf_import_helper']
  # Each line, including the last one, ends with a newline.
  device_paths = ''.join(f'- path: /dev/{name}\n' for name in device_names)
  return 'devices.gke.io/container.tcpxo-daemon', literal_string(device_paths)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def add_annotations(job_manifest, sub_networks):
  """Adds or overwrites the tcpxo annotations on the Pod template.

  Sets the tcpxo-daemon devices entry, the default interface (`eth0`) and the
  interfaces entry built from `sub_networks`.

  Args:
    job_manifest: Job manifest dict; its
      `spec.template.metadata.annotations` dict must already exist.
    sub_networks: Sub-network names, passed to `get_interfaces_entry`.
  """
  pod_annotations = job_manifest['spec']['template']['metadata']['annotations']
  daemon_key, daemon_devices = get_tcpxo_deamon_entry()
  interfaces_key, interfaces_listing = get_interfaces_entry(sub_networks)

  pod_annotations[daemon_key] = daemon_devices
  pod_annotations['networking.gke.io/default-interface'] = 'eth0'
  pod_annotations[interfaces_key] = interfaces_listing
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def add_tolerations(job_manifest):
  """Appends the `user-workload` toleration to the Pod spec.

  Args:
    job_manifest: Job manifest dict; its `spec.template.spec.tolerations`
      list must already exist and is extended in place.
  """
  pod_spec = job_manifest['spec']['template']['spec']
  user_workload_toleration = dict(
      key='user-workload',
      operator='Equal',
      value='true',
      effect='NoSchedule',
  )
  pod_spec['tolerations'].append(user_workload_toleration)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def add_volumes(job_manifest):
  """Appends the hostPath volumes used by the tcpxo setup to the Pod spec.

  Args:
    job_manifest: Job manifest dict; its `spec.template.spec.volumes` list
      must already exist and is extended in place.
  """
  # Volume name -> path on the host, in the order the volumes are appended.
  host_paths = {
      'libraries': '/home/kubernetes/bin/nvidia',
      'sys': '/sys',
      'proc-sys': '/proc/sys',
      'aperture-devices': '/dev/aperture_devices',
  }
  pod_volumes = job_manifest['spec']['template']['spec']['volumes']
  pod_volumes.extend(
      {'name': volume_name, 'hostPath': {'path': host_path}}
      for volume_name, host_path in host_paths.items()
  )
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def add_tcpxo_daemon_container(job_manifest):
  """Appends the tcpxo-daemon container to the Pod spec.

  The container uses the tcpgpudmarxd-dev image tagged with `rxdm`, runs the
  RxDM entrypoint script through `/bin/sh -c`, and mounts the `libraries`,
  `sys` and `proc-sys` volumes.

  Args:
    job_manifest: Job manifest dict; its `spec.template.spec.containers`
      list must already exist and is extended in place.
  """
  image = (
      f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}'
  )
  # Single shell script passed to `/bin/sh -c`.
  startup_script = (
      'set -ex\n'
      'chmod 755 /fts/entrypoint_rxdm_container.sh\n'
      '/fts/entrypoint_rxdm_container.sh'
      ' --num_hops=2 --num_nics=8 --uid= --alsologtostderr'
  )
  # (volume name, mount path) pairs, in mount order.
  mounts = (
      ('libraries', '/usr/local/nvidia'),
      ('sys', '/hostsysfs'),
      ('proc-sys', '/hostprocsysfs'),
  )
  daemon = {
      'name': 'tcpxo-daemon',
      'image': image,
      'imagePullPolicy': 'Always',
      'command': ['/bin/sh', '-c'],
      'args': [startup_script],
      'securityContext': {
          'capabilities': {'add': ['NET_ADMIN', 'NET_BIND_SERVICE']}
      },
      'volumeMounts': [
          {'name': volume_name, 'mountPath': mount_path}
          for volume_name, mount_path in mounts
      ],
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
  }
  pod_spec = job_manifest['spec']['template']['spec']
  pod_spec['containers'].append(daemon)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def update_gpu_containers(job_manifest):
  """Adds tcpxo environment variables and mounts to GPU containers.

  A container is updated only when `nvidia.com/gpu` appears in its
  `resources.limits`; all other containers are left untouched. Missing `env`
  and `volumeMounts` lists are created before they are extended.

  Args:
    job_manifest: Job manifest dict; its `spec.template.spec.containers`
      list must already exist. Matching containers are modified in place.
  """
  for container in job_manifest['spec']['template']['spec']['containers']:
    limits = container.get('resources', {}).get('limits', {})
    if 'nvidia.com/gpu' not in limits:
      continue

    container.setdefault('env', []).extend([
        {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'},
        {
            'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',
            'value': '/dev/aperture_devices',
        },
    ])
    container.setdefault('volumeMounts', []).extend([
        {'name': 'aperture-devices', 'mountPath': '/dev/aperture_devices'},
        {'name': 'libraries', 'mountPath': '/usr/local/nvidia'},
    ])
|
xpk/main.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2023 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
r"""xpk (Accelerated Processing Kit).
|
|
18
|
+
|
|
19
|
+
Next Steps:
|
|
20
|
+
- Cluster describe is broken by Cacheimage since that counts as a workload.
|
|
21
|
+
- Cluster describe: count by jobset.
|
|
22
|
+
- If any instance goes down, bring down the whole job.
|
|
23
|
+
- How to more gracefully handle job failures, distinguishing between software
|
|
24
|
+
and infra?
|
|
25
|
+
- Look into --docker-name and --docker-image.
|
|
26
|
+
Shouldn't one string be adequate to express what we want?
|
|
27
|
+
- Apply learnings about private, region, coredns, etc.:
|
|
28
|
+
- Enable special preheater
|
|
29
|
+
- Make this Argparse logic a function?
|
|
30
|
+
- Obvious logic that starts in main instead of here in code but args will
|
|
31
|
+
not be a universal argument.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import argparse
|
|
35
|
+
import sys
|
|
36
|
+
|
|
37
|
+
from .parser.core import set_parser
|
|
38
|
+
from .utils.console import xpk_print
|
|
39
|
+
from .utils.validation import validate_dependencies
|
|
40
|
+
################### Compatibility Check ###################
|
|
41
|
+
# Check that the user runs the below version or greater.
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
major_version_supported = 3
|
|
45
|
+
minor_version_supported = 10
|
|
46
|
+
|
|
47
|
+
user_major_version = sys.version_info[0]
|
|
48
|
+
user_minor_version = sys.version_info[1]
|
|
49
|
+
if (
|
|
50
|
+
user_major_version < major_version_supported
|
|
51
|
+
or user_minor_version < minor_version_supported
|
|
52
|
+
):
|
|
53
|
+
raise RuntimeError(
|
|
54
|
+
'xpk must be run with Python'
|
|
55
|
+
f' {major_version_supported}.{minor_version_supported} or greater.'
|
|
56
|
+
f' User currently is running {user_major_version}.{user_minor_version}'
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Build the top-level parser for the `xpk` command and hand it to set_parser.
parser = argparse.ArgumentParser(prog='xpk', description='xpk command')
set_parser(parser=parser)

xpk_print('Starting xpk', flush=True)
validate_dependencies()

main_args = parser.parse_args()
# enable_ray_cluster is always set to False before the command runs.
main_args.enable_ray_cluster = False
# Run the handler stored on the parsed arguments as `func`.
main_args.func(main_args)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def main() -> None:
  """Prints the final 'XPK Done.' message.

  Argument parsing and command dispatch happen at module level when this
  module is loaded, so this function only reports completion.
  """
  xpk_print('XPK Done.', flush=True)


# Run main() when this file is executed as a script.
if __name__ == '__main__':
  main()
|
xpk/parser/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
xpk/parser/batch.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2024 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .common import (
|
|
18
|
+
add_shared_arguments,
|
|
19
|
+
add_slurm_arguments,
|
|
20
|
+
add_cluster_arguments,
|
|
21
|
+
add_kind_cluster_arguments,
|
|
22
|
+
)
|
|
23
|
+
from ..commands.batch import batch
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def set_batch_parser(batch_parser):
  """Registers the arguments and handler of the `batch` command.

  Adds a required group with the positional `script` argument, an optional
  group filled by the shared argument helpers, and sets `batch` as the
  parser's default `func`.

  Args:
    batch_parser: Parser for the `batch` command. It must provide
      `add_argument_group` and `set_defaults` (presumably an
      argparse.ArgumentParser; confirm against the caller).
  """
  required_group = batch_parser.add_argument_group(
      'batch Built-in Arguments', 'Arguments required for `batch`.'
  )
  optional_group = batch_parser.add_argument_group(
      'Optional Arguments', 'Arguments optional for `batch`.'
  )

  # Required: the script to run.
  required_group.add_argument('script', help='script with batch task to run')

  # Optional: arguments shared with other commands, added in this order.
  for add_arguments in (
      add_cluster_arguments,
      add_kind_cluster_arguments,
      add_shared_arguments,
      add_slurm_arguments,
  ):
    add_arguments(optional_group)

  batch_parser.set_defaults(func=batch)
|