xpk 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. xpk/__init__.py +15 -0
  2. xpk/api/__init__.py +15 -0
  3. xpk/api/storage_crd.yaml +52 -0
  4. xpk/commands/__init__.py +15 -0
  5. xpk/commands/batch.py +131 -0
  6. xpk/commands/cluster.py +808 -0
  7. xpk/commands/cluster_gcluster.py +269 -0
  8. xpk/commands/common.py +44 -0
  9. xpk/commands/config.py +29 -0
  10. xpk/commands/info.py +243 -0
  11. xpk/commands/inspector.py +357 -0
  12. xpk/commands/job.py +199 -0
  13. xpk/commands/kind.py +283 -0
  14. xpk/commands/kjob_common.py +44 -0
  15. xpk/commands/run.py +128 -0
  16. xpk/commands/shell.py +140 -0
  17. xpk/commands/storage.py +267 -0
  18. xpk/commands/version.py +27 -0
  19. xpk/commands/workload.py +889 -0
  20. xpk/core/__init__.py +15 -0
  21. xpk/core/blueprint/__init__.py +15 -0
  22. xpk/core/blueprint/blueprint_definitions.py +62 -0
  23. xpk/core/blueprint/blueprint_generator.py +708 -0
  24. xpk/core/capacity.py +185 -0
  25. xpk/core/cluster.py +564 -0
  26. xpk/core/cluster_private.py +200 -0
  27. xpk/core/commands.py +356 -0
  28. xpk/core/config.py +179 -0
  29. xpk/core/docker_container.py +225 -0
  30. xpk/core/docker_image.py +210 -0
  31. xpk/core/docker_manager.py +308 -0
  32. xpk/core/docker_resources.py +350 -0
  33. xpk/core/filestore.py +251 -0
  34. xpk/core/gcloud_context.py +196 -0
  35. xpk/core/gcluster_manager.py +176 -0
  36. xpk/core/gcsfuse.py +50 -0
  37. xpk/core/kjob.py +444 -0
  38. xpk/core/kueue.py +358 -0
  39. xpk/core/monitoring.py +134 -0
  40. xpk/core/nap.py +361 -0
  41. xpk/core/network.py +377 -0
  42. xpk/core/nodepool.py +581 -0
  43. xpk/core/pathways.py +377 -0
  44. xpk/core/ray.py +222 -0
  45. xpk/core/remote_state/__init__.py +15 -0
  46. xpk/core/remote_state/fuse_remote_state.py +99 -0
  47. xpk/core/remote_state/remote_state_client.py +38 -0
  48. xpk/core/resources.py +238 -0
  49. xpk/core/scheduling.py +253 -0
  50. xpk/core/storage.py +581 -0
  51. xpk/core/system_characteristics.py +1432 -0
  52. xpk/core/vertex.py +105 -0
  53. xpk/core/workload.py +341 -0
  54. xpk/core/workload_decorators/__init__.py +15 -0
  55. xpk/core/workload_decorators/rdma_decorator.py +129 -0
  56. xpk/core/workload_decorators/storage_decorator.py +52 -0
  57. xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
  58. xpk/main.py +75 -0
  59. xpk/parser/__init__.py +15 -0
  60. xpk/parser/batch.py +43 -0
  61. xpk/parser/cluster.py +662 -0
  62. xpk/parser/common.py +259 -0
  63. xpk/parser/config.py +49 -0
  64. xpk/parser/core.py +135 -0
  65. xpk/parser/info.py +64 -0
  66. xpk/parser/inspector.py +65 -0
  67. xpk/parser/job.py +147 -0
  68. xpk/parser/kind.py +95 -0
  69. xpk/parser/run.py +47 -0
  70. xpk/parser/shell.py +59 -0
  71. xpk/parser/storage.py +316 -0
  72. xpk/parser/validators.py +39 -0
  73. xpk/parser/version.py +23 -0
  74. xpk/parser/workload.py +726 -0
  75. xpk/templates/__init__.py +15 -0
  76. xpk/templates/storage.yaml +13 -0
  77. xpk/utils/__init__.py +15 -0
  78. xpk/utils/console.py +55 -0
  79. xpk/utils/file.py +82 -0
  80. xpk/utils/gcs_utils.py +125 -0
  81. xpk/utils/kubectl.py +57 -0
  82. xpk/utils/network.py +168 -0
  83. xpk/utils/objects.py +88 -0
  84. xpk/utils/templates.py +28 -0
  85. xpk/utils/validation.py +80 -0
  86. xpk/utils/yaml.py +30 -0
  87. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/METADATA +456 -32
  88. xpk-0.7.0.dist-info/RECORD +92 -0
  89. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/WHEEL +1 -1
  90. xpk-0.7.0.dist-info/entry_points.txt +2 -0
  91. xpk-0.5.0.dist-info/RECORD +0 -7
  92. xpk-0.5.0.dist-info/entry_points.txt +0 -2
  93. xpk.py +0 -7282
  94. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/LICENSE +0 -0
  95. {xpk-0.5.0.dist-info → xpk-0.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,708 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import os
18
+ import shutil
19
+ from typing import Optional
20
+
21
+ from ruamel import yaml
22
+
23
+ from ...utils.console import xpk_exit, xpk_print
24
+ from ...utils.file import ensure_directory_exists
25
+ from ..capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, CapacityType
26
+ from ..system_characteristics import get_system_characteristics_by_device_type
27
+ from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
28
+
29
# Rebinds the imported ruamel ``yaml`` module name to a YAML() instance; every
# later use of ``yaml`` in this file goes through this instance.
yaml = yaml.YAML()

# Device types for which this module can generate a blueprint.
a3mega_device_type = H100_MEGA_DEVICE_TYPE
a3ultra_device_type = H200_DEVICE_TYPE

# Directory with the extra files copied next to each device type's blueprint.
blueprint_dependencies_dir = {
    a3mega_device_type: "src/xpk/blueprints/a3mega",
    a3ultra_device_type: "src/xpk/blueprints/a3ultra",
}
supported_device_types = set(blueprint_dependencies_dir)

# Cluster Toolkit modules source and version written into every blueprint.
cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
cluster_toolkit_version = "v1.45.1"
41
+
42
+
43
def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
  """Return the eight ``<cluster_name>-gpunet-<i>-subnet`` names (i = 0..7)."""
  network_prefix = f"{cluster_name}-gpunet"
  return [f"{network_prefix}-{index}-subnet" for index in range(8)]
45
+
46
+
47
def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
  """Return ``<cluster_name>-sub-1`` followed by the eight RDMA subnet names.

  The RDMA entries are ``<cluster_name>-rdma-sub-<i>`` for i = 0..7.
  """
  subnetworks = [f"{cluster_name}-sub-1"]
  subnetworks.extend(f"{cluster_name}-rdma-sub-{index}" for index in range(8))
  return subnetworks
51
+
52
+
53
class BlueprintGeneratorOutput:
  """Result of a blueprint generation.

  Attributes:
    - blueprint_file (str) : path to the generated blueprint file.
    - blueprint_dependencies (str) : path to the directory containing the
      blueprint dependencies.
  """

  def __init__(self, blueprint_file: str, blueprint_dependencies: str) -> None:
    """Store both paths unchanged on the instance."""
    self.blueprint_file, self.blueprint_dependencies = (
        blueprint_file,
        blueprint_dependencies,
    )
63
+
64
+
65
+ class BlueprintGenerator:
66
+ """BlueprintGenerator is a class for generating blueprints
67
+ Attributes:
68
+ - storage_path (str) - path to directory where generated files and directories will be stored.
69
+ """
70
+
71
+ def __init__(self, storage_path: str) -> None:
72
+ self.storage_path = storage_path
73
+
74
def generate_a3_mega_blueprint(
    self,
    cluster_name: str,
    project_id: str,
    blueprint_name: str,
    region: str,
    zone: str,
    auth_cidr: str,
    prefix: str = "",
    num_nodes: int = 2,
    pods_ip_cidr_range: str = "10.4.0.0/14",
    services_ip_cidr_range: str = "10.0.32.0/20",
    global_ip_address_range: str = "192.169.0.0/16",
    system_node_pool_machine_type: str = "e2-standard-32",
    primary_vpc_name: str = "network1",
    gpu_subnets_name: str = "gpunets",
    group_placement_max_distance: int = 2,
    subnetwork_cidr_suffix: int = 24,
    reservation: str | None = None,
    gcs_bucket: Optional[str | None] = None,
    capacity_type: CapacityType = CapacityType.ON_DEMAND,
    system_node_pool_min_node_count: int = 2,
) -> BlueprintGeneratorOutput:
  """Generate the A3 Mega blueprint file and copy its dependencies.

  Args:
    cluster_name: name suffix of the GKE cluster and prefix of the names of
      the subnetwork, GPU networks, placement policy, node pool and
      config maps.
    project_id: value of the blueprint's ``project_id`` variable.
    blueprint_name: blueprint and deployment name; also names the generated
      YAML file and the dependencies directory.
    region: value of the blueprint's ``region`` variable.
    zone: value of the ``zone`` variable and the single zone of the GPU pool.
    auth_cidr: CIDR block listed in the cluster's master authorized networks.
    prefix: sub-directory of the storage path to write into; also part of
      the Terraform backend prefix.
    num_nodes: static node count of the GPU node pool.
    pods_ip_cidr_range: CIDR of the "pods" secondary range.
    services_ip_cidr_range: CIDR of the "services" secondary range.
    global_ip_address_range: address range given to the multivpc module.
    system_node_pool_machine_type: machine type of the system node pool.
    primary_vpc_name: module id of the primary VPC.
    gpu_subnets_name: module id of the multivpc GPU networks.
    group_placement_max_distance: max distance of the placement policy.
    subnetwork_cidr_suffix: CIDR suffix passed to the multivpc module.
    reservation: reservation name, or None for no reservation.
    gcs_bucket: bucket for the Terraform backend, or None for no backend
      block.
    capacity_type: the node pool is marked spot only for CapacityType.SPOT.
    system_node_pool_min_node_count: minimum size of the system node pool.

  Returns:
    BlueprintGeneratorOutput with the path of the blueprint file and the path
    of the directory holding its dependencies.
  """
  xpk_print(f"Generating {blueprint_name} blueprint started...")
  system, _ = get_system_characteristics_by_device_type(a3mega_device_type)
  if system is None:
    xpk_print(
        "Error: Could not retrieve system characteristics for"
        f" {a3mega_device_type} device_type."
    )
    xpk_exit(1)

  # Ids that later modules reference through their `use` lists.
  cluster_module_id = "gke_cluster"
  placement_module_id = "group_placement_0"
  # Prefix of every staged dependency file referenced by the blueprint.
  stage_dir = f'$(ghpc_stage("{blueprint_name}"))'
  subnetwork_name = f"{cluster_name}-xpk-gke-a3-megagpu-subnet"
  num_chips = num_nodes * system.chips_per_vm

  vpc_settings = {
      "subnetwork_name": subnetwork_name,
      "secondary_ranges": {
          subnetwork_name: [
              {"range_name": "pods", "ip_cidr_range": pods_ip_cidr_range},
              {
                  "range_name": "services",
                  "ip_cidr_range": services_ip_cidr_range,
              },
          ]
      },
  }
  primary_vpc = DeploymentModule(
      id=primary_vpc_name,
      source="modules/network/vpc",
      settings=vpc_settings,
  )

  gpunets_settings = {
      "network_name_prefix": f"{cluster_name}-gpunet",
      "global_ip_address_range": global_ip_address_range,
      "network_count": 8,
      "subnetwork_cidr_suffix": subnetwork_cidr_suffix,
  }
  gpunets = DeploymentModule(
      id=gpu_subnets_name,
      source="modules/network/multivpc",
      settings=gpunets_settings,
  )

  cluster_settings = {
      "release_channel": "RAPID",
      "prefix_with_deployment_name": False,
      "name_suffix": cluster_name,
      "enable_private_endpoint": False,
      "enable_gcsfuse_csi": True,
      "enable_filestore_csi": True,
      # Allows your machine to run kubectl commands. It's required for the
      # multi-network setup.
      "master_authorized_networks": [{
          "cidr_block": f"{auth_cidr}",
          "display_name": "kubectl-access-network",
      }],
      "system_node_pool_machine_type": system_node_pool_machine_type,
      "system_node_pool_node_count": {
          "total_min_nodes": system_node_pool_min_node_count,
          "total_max_nodes": 1000,
      },
  }
  gke_cluster = DeploymentModule(
      id=cluster_module_id,
      source="modules/scheduler/gke-cluster",
      use=[primary_vpc_name, gpu_subnets_name],
      settings=cluster_settings,
      outputs=["instructions"],
  )

  group_placement_0 = DeploymentModule(
      id=placement_module_id,
      source="modules/compute/resource-policy",
      settings={
          "name": f"{cluster_name}-gp-np-0",
          "group_placement_max_distance": group_placement_max_distance,
      },
  )

  pool_settings = {
      "name": f"{cluster_name}-a3-megagpu-pool-0",
      "machine_type": system.gce_machine_type,
      "static_node_count": num_nodes,
      "zones": [zone],
      "host_maintenance_interval": "PERIODIC",
      "reservation_affinity": self._getblock_reservation_affinity(
          reservation
      ),
      "run_workload_script": False,
      "spot": capacity_type == CapacityType.SPOT,
      "max_pods_per_node": 32,
      "auto_upgrade": True,
  }
  a3_megagpu_pool_0 = DeploymentModule(
      id="a3_megagpu_pool_0",
      source="modules/compute/gke-node-pool",
      use=[cluster_module_id, gpu_subnets_name, placement_module_id],
      settings=pool_settings,
      outputs=["instructions"],
  )

  workload_settings = {
      "kueue": {
          "install": True,
          "version": "v0.10.0",  # TAS feature-gates is enabled in CT
          "config_path": f"{stage_dir}/kueue-xpk-configuration.yaml.tftpl",
          "config_template_vars": {"num_chips": f"{num_chips}"},
      },
      "jobset": {"install": True, "version": "v0.7.2"},
      "apply_manifests": [{"source": f"{stage_dir}/storage_crd.yaml"}],
  }
  workload = DeploymentModule(
      id="workload_component_install",
      source="modules/management/kubectl-apply",
      use=[cluster_module_id],
      settings=workload_settings,
  )

  configmap_template_vars = {
      "resource_config_name": f"{cluster_name}-resources-configmap",
      "num_nodes": f"{num_nodes}",
      "cluster_config_name": f"{cluster_name}-metadata-configmap",
      "capacity_type": f"{capacity_type.value}",
      "reservation": f"{reservation}",
  }
  workload_configmap = DeploymentModule(
      id="workload_configmap",
      source="modules/management/kubectl-apply",
      use=[cluster_module_id],
      settings={
          "apply_manifests": [{
              "source": f"{stage_dir}/config-map.yaml.tftpl",
              "template_vars": configmap_template_vars,
          }]
      },
  )

  primary_group = DeploymentGroup(
      group="primary",
      modules=[
          primary_vpc,
          gpunets,
          gke_cluster,
          group_placement_0,
          a3_megagpu_pool_0,
          workload,
          workload_configmap,
      ],
  )
  backend_defaults = self._getblock_terraform_backend(gcs_bucket, prefix)
  a3_mega_blueprint = Blueprint(
      terraform_backend_defaults=backend_defaults,
      blueprint_name=blueprint_name,
      toolkit_modules_url=cluster_toolkit_url,
      toolkit_modules_version=cluster_toolkit_version,
      deployment_groups=[primary_group],
      vars={
          "project_id": project_id,
          "deployment_name": blueprint_name,
          "region": region,
          "zone": zone,
      },
  )

  # Write the blueprint first, then copy the files it references.
  blueprint_file_path = self._save_blueprint_to_file(
      blueprint_name, a3_mega_blueprint, prefix
  )
  blueprint_dependencies = self._get_a3_mega_blueprint_dependencies(
      blueprint_name, prefix
  )
  xpk_print(f"Blueprint file path: {blueprint_file_path}")
  xpk_print(
      f"Blueprint dependencies directory path: {blueprint_dependencies}"
  )
  xpk_print(f"The {blueprint_name} blueprint generated.")
  return BlueprintGeneratorOutput(
      blueprint_file=blueprint_file_path,
      blueprint_dependencies=blueprint_dependencies,
  )
276
+
277
def generate_gke_ml_blueprint(
    self,
    cluster_name: str,
    blueprint_name: str,
    project_id: str,
    region: str,
    auth_cidr: str,
    prefix: str = "",
    gcs_bucket: Optional[str | None] = None,
) -> BlueprintGeneratorOutput:
  """Generate the blueprint of a simple GKE cluster (one VPC, one cluster).

  Args:
    cluster_name: name suffix of the GKE cluster.
    blueprint_name: blueprint and deployment name; also prefixes the
      subnetwork name and names the generated YAML file.
    project_id: value of the blueprint's ``project_id`` variable.
    region: value of the blueprint's ``region`` variable.
    auth_cidr: CIDR block listed in the cluster's master authorized networks.
    prefix: sub-directory of the storage path to write into; also part of
      the Terraform backend prefix.
    gcs_bucket: bucket for the Terraform backend, or None for no backend
      block.

  Returns:
    BlueprintGeneratorOutput with the path of the blueprint file. This
    blueprint has no dependencies, so ``blueprint_dependencies`` is "".
  """
  subnetwork_name = f"{blueprint_name}-gke-subnet"
  network1 = DeploymentModule(
      id="network1",
      source="modules/network/vpc",
      settings={
          "subnetwork_name": subnetwork_name,
          "secondary_ranges": {
              subnetwork_name: [
                  {"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
                  {
                      "range_name": "services",
                      "ip_cidr_range": "10.0.32.0/20",
                  },
              ]
          },
      },
  )

  gke_cluster = DeploymentModule(
      id="gke_cluster",
      source="modules/scheduler/gke-cluster",
      use=["network1"],
      settings={
          "prefix_with_deployment_name": False,
          "name_suffix": cluster_name,
          # Allows for access from authorized public IPs. A real boolean is
          # used (not the string "false") to match the other generators in
          # this class, which pass False to the same module.
          "enable_private_endpoint": False,
          "master_authorized_networks": [{
              "display_name": "deployment-machine",
              "cidr_block": auth_cidr,
          }],
      },
      outputs=["instructions"],
  )

  primary_group = DeploymentGroup(
      group="primary",
      modules=[network1, gke_cluster],
  )
  ml_gke = Blueprint(
      terraform_backend_defaults=self._getblock_terraform_backend(
          gcs_bucket, prefix
      ),
      blueprint_name=blueprint_name,
      toolkit_modules_url=cluster_toolkit_url,
      toolkit_modules_version=cluster_toolkit_version,
      deployment_groups=[primary_group],
      vars={
          "project_id": project_id,
          "deployment_name": blueprint_name,
          "region": region,
      },
  )

  blueprint_file_path = self._save_blueprint_to_file(
      blueprint_name, ml_gke, prefix
  )
  return BlueprintGeneratorOutput(
      blueprint_file=blueprint_file_path,
      blueprint_dependencies="",
  )
355
+
356
def generate_a3_ultra_blueprint(
    self,
    project_id: str,
    cluster_name: str,
    blueprint_name: str,
    region: str,
    zone: str,
    auth_cidr: str,
    system_node_pool_machine_type: str,
    reservation: Optional[str | None] = None,
    gcs_bucket: Optional[str | None] = None,
    num_nodes: int = 2,
    enable_filestore_csi_driver=True,
    prefix: str = "",
    mtu_size: int = 8896,
    system_node_pool_min_node_count: int = 2,
    capacity_type: CapacityType = CapacityType.ON_DEMAND,
) -> BlueprintGeneratorOutput:
  """Generate the A3 Ultra blueprint file and copy its dependencies.

  Args:
    project_id: value of the ``project_id`` variable; also used in the
      network profile URL and as ``subnetwork_project``.
    cluster_name: name suffix of the GKE cluster and prefix of the network,
      subnetwork, firewall, node pool and config map names.
    blueprint_name: blueprint and deployment name; also names the generated
      YAML file and the dependencies directory.
    region: value of the ``region`` variable and region of the subnetworks.
    zone: value of the ``zone`` variable, the single zone of the GPU pool,
      and part of the network profile name.
    auth_cidr: CIDR block listed in the cluster's master authorized networks.
    system_node_pool_machine_type: machine type of the system node pool.
    reservation: reservation name, or None for no reservation.
    gcs_bucket: bucket for the Terraform backend, or None for no backend
      block.
    num_nodes: static node count of the GPU node pool.
    enable_filestore_csi_driver: value of the ``enable_filestore_csi``
      cluster setting.
    prefix: sub-directory of the storage path to write into; also part of
      the Terraform backend prefix.
    mtu_size: MTU of the second VPC and of the RDMA VPC.
    system_node_pool_min_node_count: minimum size of the system node pool.
    capacity_type: the node pool is marked spot only for CapacityType.SPOT.

  Returns:
    BlueprintGeneratorOutput with the path of the blueprint file and the path
    of the directory holding its dependencies.
  """
  # Prefix of every staged dependency file referenced by the blueprint.
  stage_dir = f'$(ghpc_stage("{blueprint_name}"))'
  nccl_installer_path = f"{stage_dir}/nccl-installer.yaml"
  mlgru_disable_path = f"{stage_dir}/mlgru-disable.yaml"

  # Module ids; the network modules are named after their networks.
  net_0_id = f"{cluster_name}-net-0"
  net_1_id = f"{cluster_name}-net-1"
  rma_net_id = f"{cluster_name}-rdma-net"
  cluster_id = f"{cluster_name}-a3-ultragpu-cluster"

  def internal_firewall_rule(index: int) -> dict:
    # A fresh dict is built on every call so that the two VPC modules never
    # share mutable objects.
    return {
        "name": f"{cluster_name}-internal-{index}",
        "ranges": ["192.168.0.0/16"],
        "allow": [
            {"protocol": "tcp", "ports": ["0-65535"]},
            {"protocol": "udp", "ports": ["0-65535"]},
            {"protocol": "icmp"},
        ],
    }

  # Expression used by both the cluster and the GPU node pool: the second
  # VPC's interface followed by the RDMA VPC's interfaces.
  additional_networks = (
      f"$(concat([{{network={net_1_id}.network_name,"
      f" subnetwork={net_1_id}.subnetwork_name,"
      f' subnetwork_project="{project_id}", nic_type="GVNIC",'
      " queue_count=null, network_ip=null, stack_type=null,"
      " access_config=[{nat_ip=null, public_ptr_domain_name=null,"
      " network_tier=null}], ipv6_access_config=[],"
      " alias_ip_range=[]}],"
      f" {rma_net_id}.subnetwork_interfaces_gke))"
  )

  gpu_net_0 = DeploymentModule(
      id=net_0_id,
      source="modules/network/vpc",
      settings={
          "network_name": net_0_id,
          "subnetworks": [{
              "subnet_name": f"{cluster_name}-sub-0",
              "subnet_region": region,
              "subnet_ip": "192.168.0.0/18",
          }],
          "secondary_ranges_list": [{
              "subnetwork_name": f"{cluster_name}-sub-0",
              "ranges": [
                  {"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
                  {"range_name": "services", "ip_cidr_range": "10.0.32.0/20"},
              ],
          }],
          "firewall_rules": [internal_firewall_rule(0)],
      },
  )

  gpu_net_1 = DeploymentModule(
      id=net_1_id,
      source="modules/network/vpc",
      settings={
          "network_name": net_1_id,
          "mtu": mtu_size,
          "subnetworks": [{
              "subnet_name": f"{cluster_name}-sub-1",
              "subnet_region": region,
              "subnet_ip": "192.168.64.0/18",
          }],
          "firewall_rules": [internal_firewall_rule(1)],
      },
  )

  rma_net = DeploymentModule(
      id=rma_net_id,
      source="modules/network/gpu-rdma-vpc",
      settings={
          "network_name": rma_net_id,
          "mtu": mtu_size,
          "network_profile": f"https://www.googleapis.com/compute/beta/projects/{project_id}/global/networkProfiles/{zone}-vpc-roce",
          "network_routing_mode": "REGIONAL",
          "subnetworks_template": {
              "name_prefix": f"{cluster_name}-rdma-sub",
              "count": 8,
              "ip_range": "192.168.128.0/18",
              "region": region,
          },
      },
  )

  cluster_settings = {
      "release_channel": "RAPID",
      "version_prefix": "1.31.",
      "maintenance_exclusions": [{
          "name": "no-minor-or-node-upgrades-indefinite",
          "start_time": "2024-12-01T00:00:00Z",
          "end_time": "2025-12-22T00:00:00Z",
          "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
      }],
      "prefix_with_deployment_name": False,
      "name_suffix": cluster_name,
      "system_node_pool_machine_type": system_node_pool_machine_type,
      "enable_dcgm_monitoring": True,
      "enable_gcsfuse_csi": True,
      "enable_filestore_csi": enable_filestore_csi_driver,
      "enable_private_endpoint": False,
      "master_authorized_networks": [{
          "cidr_block": auth_cidr,
          "display_name": "kubectl-access-network",
      }],
      "system_node_pool_node_count": {
          "total_min_nodes": system_node_pool_min_node_count,
          "total_max_nodes": 1000,
      },
      "additional_networks": additional_networks,
  }
  a3_ultra_cluster = DeploymentModule(
      id=cluster_id,
      source="modules/scheduler/gke-cluster",
      use=[net_0_id],
      settings=cluster_settings,
      outputs=["instructions"],
  )

  system, _ = get_system_characteristics_by_device_type(a3ultra_device_type)
  if system is None:
    xpk_print(
        "Error: Could not retrieve system characteristics for"
        f" {a3ultra_device_type} device_type."
    )
    xpk_exit(1)

  pool_settings = {
      "machine_type": system.gce_machine_type,
      "auto_upgrade": True,
      "zones": [zone],
      "static_node_count": num_nodes,
      "spot": capacity_type == CapacityType.SPOT,
      "reservation_affinity": self._getblock_reservation_affinity(
          reservation
      ),
      "max_pods_per_node": 32,
      "guest_accelerator": [{
          "type": "nvidia-h200-141gb",
          "count": 8,
          "gpu_driver_installation_config": {
              "gpu_driver_version": "LATEST"
          },
      }],
      "additional_networks": additional_networks,
  }
  gpu_pool = DeploymentModule(
      id=f"{cluster_name}-a3u-pool",
      source="modules/compute/gke-node-pool",
      use=[cluster_id],
      settings=pool_settings,
      outputs=["instructions"],
  )

  num_chips = num_nodes * system.chips_per_vm
  workload_manager_install_id = "workload-manager-install"
  workload_manager_settings = {
      "kueue": {
          "install": True,
          "version": "v0.10.0",  # TAS feature-gates is enabled in CT
          "config_path": f"{stage_dir}/kueue-xpk-configuration.yaml.tftpl",
          "config_template_vars": {"num_chips": f"{num_chips}"},
      },
      "jobset": {"install": True, "version": "v0.7.2"},
      "apply_manifests": [
          {"source": nccl_installer_path},
          {"source": mlgru_disable_path},
          {"source": f"{stage_dir}/storage_crd.yaml"},
      ],
  }
  workload_manager_install = DeploymentModule(
      id=workload_manager_install_id,
      source="modules/management/kubectl-apply",
      use=[cluster_id],
      settings=workload_manager_settings,
  )

  configmap_template_vars = {
      "resource_config_name": f"{cluster_name}-resources-configmap",
      "num_nodes": f"{num_nodes}",
      "cluster_config_name": f"{cluster_name}-metadata-configmap",
      "capacity_type": f"{capacity_type.value}",
      "reservation": f"{reservation}",
  }
  workload_configmap = DeploymentModule(
      id="workload_configmap",
      source="modules/management/kubectl-apply",
      use=[cluster_id],
      settings={
          "apply_manifests": [{
              "source": f"{stage_dir}/config-map.yaml.tftpl",
              "template_vars": configmap_template_vars,
          }]
      },
  )

  primary_group = DeploymentGroup(
      group="primary",
      modules=[
          gpu_net_0,
          gpu_net_1,
          rma_net,
          a3_ultra_cluster,
          gpu_pool,
          workload_manager_install,
          workload_configmap,
      ],
  )
  backend_defaults = self._getblock_terraform_backend(gcs_bucket, prefix)
  a3_ultra_blueprint = Blueprint(
      terraform_backend_defaults=backend_defaults,
      blueprint_name=blueprint_name,
      toolkit_modules_url=cluster_toolkit_url,
      toolkit_modules_version=cluster_toolkit_version,
      deployment_groups=[primary_group],
      vars={
          "project_id": project_id,
          "deployment_name": blueprint_name,
          "region": region,
          "zone": zone,
      },
  )

  # Write the blueprint first, then copy the files it references.
  blueprint_file_path = self._save_blueprint_to_file(
      blueprint_name, a3_ultra_blueprint, prefix
  )
  blueprint_dependencies = self._get_a3_ultra_blueprint_dependencies(
      blueprint_name, prefix
  )
  return BlueprintGeneratorOutput(
      blueprint_file=blueprint_file_path,
      blueprint_dependencies=blueprint_dependencies,
  )
624
+
625
+ def _getblock_reservation_affinity(
626
+ self, reservation: str | None = None
627
+ ) -> dict:
628
+ return (
629
+ {
630
+ "consume_reservation_type": "NO_RESERVATION",
631
+ "specific_reservations": [],
632
+ }
633
+ if reservation is None
634
+ else {
635
+ "consume_reservation_type": "SPECIFIC_RESERVATION",
636
+ "specific_reservations": [{"name": reservation}],
637
+ }
638
+ )
639
+
640
+ def _getblock_terraform_backend(
641
+ self, gcs_bucket: str, prefix: str = ""
642
+ ) -> dict | None:
643
+ if gcs_bucket is None:
644
+ return None
645
+ return {
646
+ "type": "gcs",
647
+ "configuration": {
648
+ "bucket": gcs_bucket,
649
+ "prefix": self._get_terraforrm_backend_full_prefix(prefix),
650
+ },
651
+ }
652
+
653
+ def _get_terraforrm_backend_full_prefix(self, prefix: str = "") -> str:
654
+ return f"xpk_terraform_state/{prefix}/tfstate/"
655
+
656
def _save_blueprint_to_file(
    self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
) -> str:
  """Dump ``xpk_blueprint`` as YAML and return the path that was written.

  Args:
    blueprint_name: name of the blueprint; the file is ``<name>.yaml``.
    xpk_blueprint: blueprint object to serialize.
    prefix: sub-directory of the storage path to write into.

  Returns:
    Path of the written blueprint file.
  """
  target_path = self._get_blueprint_path(blueprint_name, prefix)
  # Any existing file at the target path is truncated and rewritten.
  with open(target_path, "w+", encoding="utf-8") as stream:
    yaml.dump(xpk_blueprint, stream)
  return target_path
663
+
664
def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
  """Return ``<storage path for prefix>/<blueprint_name>.yaml``.

  Resolving the storage path goes through ``_get_storage_path``, which also
  ensures that the directory exists.
  """
  storage_dir = self._get_storage_path(prefix)
  return os.path.join(storage_dir, f"{blueprint_name}.yaml")
669
+
670
def _get_storage_path(self, prefix):
  """Return ``storage_path`` joined with ``prefix``.

  ``ensure_directory_exists`` is called on the resulting path before it is
  returned.
  """
  prefixed_dir = os.path.join(self.storage_path, prefix)
  ensure_directory_exists(prefixed_dir)
  return prefixed_dir
674
+
675
def blueprint_exists(self, blueprint_name, prefix: str = ""):
  """Return True when the blueprint file for this name and prefix exists.

  The path is resolved through ``_get_blueprint_path``, so the storage
  directory for ``prefix`` is ensured to exist as part of the check.
  """
  return os.path.exists(self._get_blueprint_path(blueprint_name, prefix))
678
+
679
def _get_a3_mega_blueprint_dependencies(
    self, blueprint_name: str, prefix: str = ""
) -> str:
  """Copy the A3 Mega dependency files next to the blueprint.

  Args:
    blueprint_name: name of the destination directory inside the storage
      path for ``prefix``.
    prefix: sub-directory of the storage path to copy into.

  Returns:
    Path of the destination directory.
  """
  destination = os.path.join(self._get_storage_path(prefix), blueprint_name)
  source_dir = blueprint_dependencies_dir[a3mega_device_type]
  # dirs_exist_ok lets a repeated generation copy into an existing directory.
  shutil.copytree(source_dir, destination, dirs_exist_ok=True)
  return destination
691
+
692
def _get_a3_ultra_blueprint_dependencies(
    self, blueprint_name: str, prefix: str = ""
) -> str:
  """Copy the A3 Ultra dependency files next to the blueprint.

  Args:
    blueprint_name: name of the destination directory inside the storage
      path for ``prefix``.
    prefix: sub-directory of the storage path to copy into.

  Returns:
    Path of the destination directory.
  """
  destination = os.path.join(self._get_storage_path(prefix), blueprint_name)
  source_dir = blueprint_dependencies_dir[a3ultra_device_type]
  # dirs_exist_ok lets a repeated generation copy into an existing directory.
  shutil.copytree(source_dir, destination, dirs_exist_ok=True)
  return destination
704
+
705
+
706
# Register the blueprint model classes with the module-level YAML instance,
# in the same order as before.
for _blueprint_class in (Blueprint, DeploymentGroup, DeploymentModule):
  yaml.register_class(_blueprint_class)