xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. xpk-0.0.1.dist-info/LICENSE +202 -0
  88. xpk-0.0.1.dist-info/METADATA +1498 -0
  89. xpk-0.0.1.dist-info/RECORD +92 -0
  90. xpk-0.0.1.dist-info/WHEEL +5 -0
  91. xpk-0.0.1.dist-info/entry_points.txt +2 -0
  92. xpk-0.0.1.dist-info/top_level.txt +1 -0
xpk/core/network.py ADDED
@@ -0,0 +1,377 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ from ..utils.console import xpk_print
18
+ from ..utils.file import write_tmp_file
19
+ from .capacity import H100_DEVICE_TYPE
20
+ from .commands import run_command_for_value, run_command_with_updates
21
+ from .gcloud_context import zone_to_region
22
+ from .system_characteristics import SystemCharacteristics
23
+
24
# CLUSTER_NETWORK_YAML: the manifest applied when creating the network config
# for an a3 cluster. It declares four `Network` objects (vpc1..vpc4), each
# pointing at a same-named `GKENetworkParamSet`, which in turn binds the
# `{cluster_name}-net-N` VPC and `{cluster_name}-sub-N` subnet in NetDevice
# mode. `{cluster_name}` is the only placeholder and is filled via str.format.
# The nesting below matters: `name` lives under `metadata`, `parametersRef`
# and `type` under `spec`, and the vpc/vpcSubnet/deviceMode keys under `spec`.
CLUSTER_NETWORK_YAML = """
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc1
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc1
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc2
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc2
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc3
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc3
  type: Device
---
apiVersion: networking.gke.io/v1
kind: Network
metadata:
  name: vpc4
spec:
  parametersRef:
    group: networking.gke.io
    kind: GKENetworkParamSet
    name: vpc4
  type: Device
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc1
spec:
  vpc: {cluster_name}-net-1
  vpcSubnet: {cluster_name}-sub-1
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc2
spec:
  vpc: {cluster_name}-net-2
  vpcSubnet: {cluster_name}-sub-2
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc3
spec:
  vpc: {cluster_name}-net-3
  vpcSubnet: {cluster_name}-sub-3
  deviceMode: NetDevice
---
apiVersion: networking.gke.io/v1
kind: GKENetworkParamSet
metadata:
  name: vpc4
spec:
  vpc: {cluster_name}-net-4
  vpcSubnet: {cluster_name}-sub-4
  deviceMode: NetDevice
"""
106
+
107
+
108
def create_cluster_network(args, index) -> int:
  """Create the `{cluster}-net-{index}` VPC network unless it already exists.

  Args:
    args: user provided arguments for running the command.
    index: index number for the network to be created.

  Returns:
    0 if the network exists or was created, non-zero otherwise.
  """
  known_networks, list_status = get_all_networks_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all networks failed!')
    return list_status

  network_name = f'{args.cluster}-net-{index}'
  if network_name in known_networks:
    # Nothing to do: a previous run (or the user) already made this network.
    xpk_print(f'Reusing existing network {network_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks create'
      f' {network_name} --subnet-mode=custom --mtu=8244'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Network', args, verbose=False
  )
  if create_status != 0:
    xpk_print(f'Create Cluster Network request returned ERROR {create_status}')
    return 1

  return 0
141
+
142
+
143
def create_cluster_subnet(args, index) -> int:
  """Create the `{cluster}-{region}-sub-{index}` subnet unless it exists.

  The subnet is attached to `{cluster}-net-{index}` and given the range
  `192.168.{index}.0/24`.

  Args:
    args: user provided arguments for running the command.
    index: index number for the subnet to be created.

  Returns:
    0 if the subnet exists or was created, non-zero otherwise.
  """
  known_subnets, list_status = get_all_subnets_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all subnets failed!')
    return list_status

  region = zone_to_region(args.zone)
  subnet_name = f'{args.cluster}-{region}-sub-{index}'
  if subnet_name in known_subnets:
    xpk_print(f'Reusing existing subnet {subnet_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} networks subnets create'
      f' {subnet_name} --network={args.cluster}-net-{index}'
      f' --region={region} --range=192.168.{index}.0/24'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Subnet', args, verbose=False
  )
  if create_status != 0:
    xpk_print(f'Create Cluster Subnet request returned ERROR {create_status}')
    return 1

  return 0
176
+
177
+
178
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Return the eight `{cluster_name}-gpunet-{i}-subnet` names, i = 0..7."""
  subnet_names = []
  for nic_index in range(8):
    subnet_names.append(f'{cluster_name}-gpunet-{nic_index}-subnet')
  return subnet_names
180
+
181
+
182
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Return `{cluster_name}-sub-1` followed by `{cluster_name}-rdma-sub-{i}`.

  The rdma entries cover i = 0..7, so the result always has nine names.
  """
  subnet_names = [f'{cluster_name}-sub-1']
  subnet_names.extend(
      f'{cluster_name}-rdma-sub-{rdma_index}' for rdma_index in range(8)
  )
  return subnet_names
186
+
187
+
188
def create_cluster_firewall_rule(args, index) -> int:
  """Create the `{cluster}-internal-{index}` firewall rule unless it exists.

  The rule is attached to `{cluster}-net-{index}` and allows all tcp/udp
  ports plus icmp from 192.168.0.0/16.

  Args:
    args: user provided arguments for running the command.
    index: index number for the firewall rule to be created.

  Returns:
    0 if the rule exists or was created, non-zero otherwise.
  """
  known_rules, list_status = get_all_firewall_rules_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all firewall rules failed!')
    return list_status

  firewall_rule_name = f'{args.cluster}-internal-{index}'
  if firewall_rule_name in known_rules:
    xpk_print(f'Reusing existing firewall rule {firewall_rule_name}')
    return 0

  create_command = (
      f'gcloud compute --project={args.project} firewall-rules create'
      f' {firewall_rule_name}'
      f' --network={args.cluster}-net-{index}'
      ' --action=ALLOW'
      ' --rules=tcp:0-65535,udp:0-65535,icmp'
      ' --source-ranges=192.168.0.0/16'
  )
  create_status = run_command_with_updates(
      create_command, 'Create Cluster Firewall Rule', args, verbose=False
  )
  if create_status != 0:
    xpk_print(
        f'Create Cluster Firewall Rule request returned ERROR {create_status}'
    )
    return 1

  return 0
223
+
224
+
225
def create_cluster_network_config(args) -> int:
  """Run the Create GKE Cluster Network Config request.

  Renders CLUSTER_NETWORK_YAML for `args.cluster`, writes it to a temporary
  file and applies it with `kubectl apply -f`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful and 1 otherwise.
  """
  yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
  # Keep `tmp` bound until kubectl has run; presumably the temporary file is
  # removed once the object is released -- confirm against write_tmp_file.
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {str(tmp.file.name)}'

  return_code = run_command_with_updates(
      command, 'GKE Cluster Create Network Config', args
  )
  if return_code != 0:
    # Name the operation that actually failed (this is not a ConfigMap).
    xpk_print(
        'GKE Cluster Create Network Config request returned ERROR'
        f' {return_code}'
    )
    return 1

  return 0
248
+
249
+
250
def set_up_cluster_network_for_gpu(args, system: SystemCharacteristics) -> int:
  """Set up GKE Cluster networks, subnets and firewall rules for A3/A3+.

  Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node,
  and there are 8 NICs for GPU-GPU bw and 1 NIC for host in an A3+ node.
  One network, subnet and firewall rule is created per GPU NIC, numbered
  from 1.

  Args:
    args: user provided arguments for running the command.
    system: system characteristics.

  Returns:
    0 if successful and 1 otherwise.
  """
  gpu_nic_count = 4 if system.device_type == H100_DEVICE_TYPE else 8
  # Order matters: the subnet and firewall rule attach to the network.
  creation_steps = (
      create_cluster_network,
      create_cluster_subnet,
      create_cluster_firewall_rule,
  )
  for nic_index in range(1, gpu_nic_count + 1):
    for create_resource in creation_steps:
      if create_resource(args, nic_index) != 0:
        return 1
  return 0
274
+
275
+
276
def delete_cluster_subnets(args) -> int:
  """Delete every subnet reported by get_all_subnets_programmatic.

  Deletion stops at the first failure; subnets removed before that point
  stay deleted.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if all listed subnets were deleted, non-zero otherwise.
  """
  subnets_to_delete, list_status = get_all_subnets_programmatic(args)
  if list_status > 0:
    xpk_print('Listing all subnets failed!')
    return list_status

  for subnet_name in subnets_to_delete:
    delete_command = (
        f'gcloud compute networks subnets delete {subnet_name}'
        f' --region={zone_to_region(args.zone)}'
        f' --project={args.project} --quiet'
    )
    delete_status = run_command_with_updates(
        delete_command, 'Delete Cluster Subnet', args, verbose=False
    )
    if delete_status != 0:
      xpk_print(f'Delete Cluster Subnet request returned ERROR {delete_status}')
      return 1
    xpk_print(f'Deleted existing subnet {subnet_name}')

  return 0
307
+
308
+
309
def get_all_networks_programmatic(args) -> tuple[list[str], int]:
  """Gets the names of all the networks in the project `args.project`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of network names and 0 if successful, or an empty list and 1
    otherwise.
  """
  # Query the same project the create command targets; without --project
  # gcloud falls back to its configured default, which may be a different
  # project and would make the "already exists" check unreliable.
  command = (
      f'gcloud compute networks list --project={args.project}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_network_output = run_command_for_value(
      command, 'Get All Networks', args
  )
  if return_code != 0:
    xpk_print(f'Get All Networks returned ERROR {return_code}')
    return [], 1

  # One network name per output line (no header row is requested).
  return raw_network_output.splitlines(), 0
327
+
328
+
329
def get_all_subnets_programmatic(args) -> tuple[list[str], int]:
  """Gets the names of this cluster's subnets in the project.

  Only subnets named `{cluster}-{region}-sub-{index}` (the names produced by
  create_cluster_subnet) are returned, where region is derived from
  `args.zone`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of subnet names and 0 if successful, or an empty list and 1
    otherwise.
  """
  region = zone_to_region(args.zone)
  # gcloud's `~` filter operator is an unanchored regular-expression search,
  # not a glob. Anchor both ends so subnets of another cluster whose name
  # merely contains this cluster's name are never returned; callers delete
  # everything in this list. Cluster and region names are assumed to contain
  # only letters, digits and hyphens, so no regex escaping is applied.
  subnet_name_pattern = f'^{args.cluster}-{region}-sub-[0-9]+$'

  command = (
      'gcloud compute networks subnets list'
      f' --filter=name~"{subnet_name_pattern}" --project={args.project}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_subnets_output = run_command_for_value(
      command, 'Get All Subnets', args
  )
  if return_code != 0:
    xpk_print(f'Get All Subnets returned ERROR {return_code}')
    return [], 1

  # One subnet name per line; ignore blank lines so an empty result yields [].
  subnet_names = [
      line.strip() for line in raw_subnets_output.splitlines() if line.strip()
  ]
  return subnet_names, 0
356
+
357
+
358
def get_all_firewall_rules_programmatic(args) -> tuple[list[str], int]:
  """Gets the names of all the firewall rules in the project `args.project`.

  Args:
    args: user provided arguments for running the command.

  Returns:
    List of firewall rule names and 0 if successful, or an empty list and 1
    otherwise.
  """
  # Query the same project the create command targets; without --project
  # gcloud falls back to its configured default, which may be a different
  # project and would make the "already exists" check unreliable.
  command = (
      f'gcloud compute firewall-rules list --project={args.project}'
      ' --format="csv[no-heading](name)"'
  )
  return_code, raw_firewall_rules_output = run_command_for_value(
      command, 'Get All Firewall Rules', args
  )
  if return_code != 0:
    xpk_print(f'Get All Firewall Rules returned ERROR {return_code}')
    return [], 1

  # One firewall rule name per output line (no header row is requested).
  return raw_firewall_rules_output.splitlines(), 0