xpk 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/__init__.py +15 -0
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/__init__.py +15 -0
- xpk/commands/batch.py +131 -0
- xpk/commands/cluster.py +808 -0
- xpk/commands/cluster_gcluster.py +269 -0
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +243 -0
- xpk/commands/inspector.py +357 -0
- xpk/commands/job.py +199 -0
- xpk/commands/kind.py +283 -0
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +140 -0
- xpk/commands/storage.py +267 -0
- xpk/commands/version.py +27 -0
- xpk/commands/workload.py +889 -0
- xpk/core/__init__.py +15 -0
- xpk/core/blueprint/__init__.py +15 -0
- xpk/core/blueprint/blueprint_definitions.py +62 -0
- xpk/core/blueprint/blueprint_generator.py +708 -0
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +200 -0
- xpk/core/commands.py +356 -0
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_manager.py +308 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +176 -0
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +444 -0
- xpk/core/kueue.py +358 -0
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +361 -0
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +377 -0
- xpk/core/ray.py +222 -0
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +1432 -0
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +341 -0
- xpk/core/workload_decorators/__init__.py +15 -0
- xpk/core/workload_decorators/rdma_decorator.py +129 -0
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
- xpk/main.py +75 -0
- xpk/parser/__init__.py +15 -0
- xpk/parser/batch.py +43 -0
- xpk/parser/cluster.py +662 -0
- xpk/parser/common.py +259 -0
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +135 -0
- xpk/parser/info.py +64 -0
- xpk/parser/inspector.py +65 -0
- xpk/parser/job.py +147 -0
- xpk/parser/kind.py +95 -0
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +59 -0
- xpk/parser/storage.py +316 -0
- xpk/parser/validators.py +39 -0
- xpk/parser/version.py +23 -0
- xpk/parser/workload.py +726 -0
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/__init__.py +15 -0
- xpk/utils/console.py +55 -0
- xpk/utils/file.py +82 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/network.py +168 -0
- xpk/utils/objects.py +88 -0
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- xpk/utils/yaml.py +30 -0
- xpk-0.0.1.dist-info/LICENSE +202 -0
- xpk-0.0.1.dist-info/METADATA +1498 -0
- xpk-0.0.1.dist-info/RECORD +92 -0
- xpk-0.0.1.dist-info/WHEEL +5 -0
- xpk-0.0.1.dist-info/entry_points.txt +2 -0
- xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/network.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Copyright 2025 Google LLC
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ..utils.console import xpk_print
|
|
18
|
+
from ..utils.file import write_tmp_file
|
|
19
|
+
from .capacity import H100_DEVICE_TYPE
|
|
20
|
+
from .commands import run_command_for_value, run_command_with_updates
|
|
21
|
+
from .gcloud_context import zone_to_region
|
|
22
|
+
from .system_characteristics import SystemCharacteristics
|
|
23
|
+
|
|
24
|
+
# Manifest template applied by create_cluster_network_config. It declares four
# `Network` objects (vpc1..vpc4) and the four `GKENetworkParamSet` objects they
# reference. The only placeholder is `{cluster_name}`, filled via str.format.
#
# YAML nesting is significant: `name` belongs under `metadata`, and
# `parametersRef` / `type` / `vpc` / `vpcSubnet` / `deviceMode` under `spec`.
#
# NOTE(review): vpcSubnet below is '{cluster_name}-sub-N', whereas
# create_cluster_subnet in this file names subnets
# '{cluster}-{region}-sub-N' -- confirm which naming is intended.
CLUSTER_NETWORK_YAML = """
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc1
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc1
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc2
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc2
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc3
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc3
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc4
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc4
  type: Device
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc1
spec:
  vpc: {cluster_name}-net-1
  vpcSubnet: {cluster_name}-sub-1
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc2
spec:
  vpc: {cluster_name}-net-2
  vpcSubnet: {cluster_name}-sub-2
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc3
spec:
  vpc: {cluster_name}-net-3
  vpcSubnet: {cluster_name}-sub-3
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc4
spec:
  vpc: {cluster_name}-net-4
  vpcSubnet: {cluster_name}-sub-4
  deviceMode: NetDevice
"""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def create_cluster_network(args, index) -> int:
  """Make sure the network `<cluster>-net-<index>` exists.

  The current networks are listed first; a create request is only issued
  when the target name is missing from that listing.

  Args:
    args: user provided arguments for running the command.
    index: index number for the network to be created.

  Returns:
    0 on success (created or already present), the listing's return code if
    the listing fails, and 1 if the create request fails.
  """
  known_networks, list_rc = get_all_networks_programmatic(args)
  if list_rc > 0:
    xpk_print('Listing all networks failed!')
    return list_rc

  network_name = f'{args.cluster}-net-{index}'

  # Nothing to create when the name already shows up in the listing.
  if network_name in known_networks:
    xpk_print(f'Reusing existing network {network_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks create'
      f' {network_name} --subnet-mode=custom --mtu=8244'
  )
  create_rc = run_command_with_updates(
      create_command, 'Create Cluster Network', args, verbose=False
  )
  if create_rc != 0:
    xpk_print(f'Create Cluster Network request returned ERROR {create_rc}')
    return 1

  return 0
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def create_cluster_subnet(args, index) -> int:
  """Make sure the subnet `<cluster>-<region>-sub-<index>` exists.

  The subnet is attached to network `<cluster>-net-<index>` and uses the
  range `192.168.<index>.0/24`. It is only created when its name is absent
  from the listing returned by get_all_subnets_programmatic.

  Args:
    args: user provided arguments for running the command.
    index: index number for the subnet to be created.

  Returns:
    0 on success (created or already present), the listing's return code if
    the listing fails, and 1 if the create request fails.
  """
  known_subnets, list_rc = get_all_subnets_programmatic(args)
  if list_rc > 0:
    xpk_print('Listing all subnets failed!')
    return list_rc

  subnet_name = f'{args.cluster}-{zone_to_region(args.zone)}-sub-{index}'

  # Nothing to create when the name already shows up in the listing.
  if subnet_name in known_subnets:
    xpk_print(f'Reusing existing subnet {subnet_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks subnets create'
      f' {subnet_name} --network={args.cluster}-net-{index}'
      f' --region={zone_to_region(args.zone)}'
      f' --range=192.168.{index}.0/24'
  )
  create_rc = run_command_with_updates(
      create_command, 'Create Cluster Subnet', args, verbose=False
  )
  if create_rc != 0:
    xpk_print(f'Create Cluster Subnet request returned ERROR {create_rc}')
    return 1

  return 0
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Build the a3mega subnetwork names for a cluster.

  Args:
    cluster_name: prefix used for every generated name.

  Returns:
    Eight names of the form `<cluster_name>-gpunet-<i>-subnet`, for i from 0
    to 7 in ascending order.
  """
  subnet_names = []
  for nic in range(8):
    subnet_names.append(f'{cluster_name}-gpunet-{nic}-subnet')
  return subnet_names
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Build the a3ultra subnetwork names for a cluster.

  Args:
    cluster_name: prefix used for every generated name.

  Returns:
    Nine names: `<cluster_name>-sub-1` first, followed by
    `<cluster_name>-rdma-sub-<i>` for i from 0 to 7 in ascending order.
  """
  rdma_subnets = [f'{cluster_name}-rdma-sub-{nic}' for nic in range(8)]
  return [f'{cluster_name}-sub-1', *rdma_subnets]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def create_cluster_firewall_rule(args, index) -> int:
  """Make sure the firewall rule `<cluster>-internal-<index>` exists.

  The rule is attached to network `<cluster>-net-<index>` and allows all TCP
  and UDP ports plus ICMP from source range 192.168.0.0/16. It is only
  created when its name is absent from the firewall rule listing.

  Args:
    args: user provided arguments for running the command.
    index: index number for the firewall rule to be created.

  Returns:
    0 on success (created or already present), the listing's return code if
    the listing fails, and 1 if the create request fails.
  """
  known_rules, list_rc = get_all_firewall_rules_programmatic(args)
  if list_rc > 0:
    xpk_print('Listing all firewall rules failed!')
    return list_rc

  firewall_rule_name = f'{args.cluster}-internal-{index}'

  # Nothing to create when the name already shows up in the listing.
  if firewall_rule_name in known_rules:
    xpk_print(f'Reusing existing firewall rule {firewall_rule_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} firewall-rules create'
      f' {firewall_rule_name}'
      f' --network={args.cluster}-net-{index}'
      ' --action=ALLOW'
      ' --rules=tcp:0-65535,udp:0-65535,icmp'
      ' --source-ranges=192.168.0.0/16'
  )
  create_rc = run_command_with_updates(
      create_command, 'Create Cluster Firewall Rule', args, verbose=False
  )
  if create_rc != 0:
    xpk_print(
        f'Create Cluster Firewall Rule request returned ERROR {create_rc}'
    )
    return 1

  return 0
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def create_cluster_network_config(args) -> int:
  """Run the Create GKE Cluster Network Config request.

  Renders CLUSTER_NETWORK_YAML for `args.cluster`, writes it to a temporary
  file and applies that file with `kubectl apply -f`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
  # `tmp` stays referenced until kubectl has run; presumably the file is
  # removed once the object is released -- verify against write_tmp_file.
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {tmp.file.name}'

  return_code = run_command_with_updates(
      command, 'GKE Cluster Create Network Config', args
  )
  if return_code != 0:
    # The message names the task that actually ran (network config, not a
    # ConfigMap) so failures are attributed to the right step.
    xpk_print(
        'GKE Cluster Create Network Config request returned ERROR'
        f' {return_code}'
    )
    return 1

  return 0
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
  """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.

  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
  and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.

  For every index (1..4 when the device type equals H100_DEVICE_TYPE,
  otherwise 1..8) a network, then a subnet, then a firewall rule is created.
  Processing stops at the first step that reports a failure.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  if system.device_type == H100_DEVICE_TYPE:
    last_index = 4
  else:
    last_index = 8

  # Executed in this order for each index.
  provisioning_steps = (
      create_cluster_network,
      create_cluster_subnet,
      create_cluster_firewall_rule,
  )
  for nic_index in range(1, last_index + 1):
    for provision in provisioning_steps:
      if provision(args, nic_index) != 0:
        return 1
  return 0
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def delete_cluster_subnets(args) -> int:
  """Delete every subnet returned by get_all_subnets_programmatic.

  Subnets are deleted one at a time in the region derived from `args.zone`;
  the first failing delete aborts the remaining deletions.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if every delete succeeded (or there was nothing to delete), the
    listing's return code if the listing fails, and 1 if a delete fails.
  """
  subnets_to_delete, list_rc = get_all_subnets_programmatic(args)
  if list_rc > 0:
    xpk_print('Listing all subnets failed!')
    return list_rc

  for subnet_name in subnets_to_delete:
    delete_command = (
        f'gcloud compute networks subnets delete {subnet_name}'
        f' --region={zone_to_region(args.zone)}'
        f' --project={args.project} --quiet'
    )
    delete_rc = run_command_with_updates(
        delete_command, 'Delete Cluster Subnet', args, verbose=False
    )
    if delete_rc != 0:
      xpk_print(f'Delete Cluster Subnet request returned ERROR {delete_rc}')
      return 1
    xpk_print(f'Deleted existing subnet {subnet_name}')

  return 0
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def get_all_networks_programmatic(args) -> tuple[list[str], int]:
  """Gets the names of all the networks in the project `args.project`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of network names (one per output line) and 0 if successful;
    an empty list and 1 otherwise.
  """
  # Scope the listing to args.project explicitly: the create command issued by
  # create_cluster_network targets args.project, so the existence check must
  # look at the same project rather than gcloud's configured default.
  command = (
      'gcloud compute networks list'
      f' --project={args.project}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_network_output = run_command_for_value(
      command, 'Get All Networks', args
  )
  if return_code != 0:
    xpk_print(f'Get All Networks returned ERROR {return_code}')
    return [], 1

  return raw_network_output.splitlines(), 0
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def get_all_subnets_programmatic(args) -> tuple[list[str], int]:
  """Gets the cluster's subnets in the project.

  The listing is filtered by name on `<cluster>-<region>-sub-*`, where the
  region is derived from `args.zone`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of subnet names and 0 if successful; an empty list and 1 otherwise.
  """
  # NOTE(review): gcloud's `~` filter operator is a regular-expression match,
  # so the trailing `-*` reads as "zero or more dashes" and the pattern is
  # unanchored -- confirm this is the intended match.
  name_pattern = f'{args.cluster}-{zone_to_region(args.zone)}-sub-*'
  list_command = (
      'gcloud compute networks subnets list'
      f' --filter=name~"{name_pattern}"'
      f' --project={args.project}'
  )

  list_rc, listing = run_command_for_value(
      list_command, 'Get All Subnets', args
  )
  if list_rc != 0:
    xpk_print(f'Get All Subnets returned ERROR {list_rc}')
    return [], 1

  # The first output line is skipped (presumably the header row of gcloud's
  # default table output -- confirm); the name is the first space-separated
  # field of every remaining line.
  data_rows = listing.splitlines()[1:]
  subnet_names = [row.split(' ')[0] for row in data_rows]
  return subnet_names, 0
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
  """Gets the names of all the firewall rules in the project `args.project`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of firewall rule names (one per output line) and 0 if successful;
    an empty list and 1 otherwise.
  """
  # Scope the listing to args.project explicitly: the create command issued by
  # create_cluster_firewall_rule targets args.project, so the existence check
  # must look at the same project rather than gcloud's configured default.
  command = (
      'gcloud compute firewall-rules list'
      f' --project={args.project}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_firewall_rules_output = run_command_for_value(
      command, 'Get All Firewall Rules', args
  )
  if return_code != 0:
    xpk_print(f'Get All Firewall Rules returned ERROR {return_code}')
    return [], 1

  return raw_firewall_rules_output.splitlines(), 0
|