xpk 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. xpk/__init__.py +15 -0
  2. xpk/commands/__init__.py +15 -0
  3. xpk/commands/batch.py +109 -0
  4. xpk/commands/cluster.py +784 -0
  5. xpk/commands/cluster_gcluster.py +185 -0
  6. xpk/commands/info.py +245 -0
  7. xpk/commands/inspector.py +363 -0
  8. xpk/commands/job.py +197 -0
  9. xpk/commands/kind.py +253 -0
  10. xpk/commands/shell.py +120 -0
  11. xpk/commands/version.py +39 -0
  12. xpk/commands/workload.py +692 -0
  13. xpk/core/__init__.py +15 -0
  14. xpk/core/blueprint/__init__.py +15 -0
  15. xpk/core/blueprint/blueprint_definitions.py +61 -0
  16. xpk/core/blueprint/blueprint_generator.py +652 -0
  17. xpk/core/cluster_private.py +197 -0
  18. xpk/core/commands.py +352 -0
  19. xpk/core/core.py +2824 -0
  20. xpk/core/docker_manager.py +308 -0
  21. xpk/core/gcluster_manager.py +158 -0
  22. xpk/core/kjob.py +205 -0
  23. xpk/core/kueue.py +352 -0
  24. xpk/core/nap.py +349 -0
  25. xpk/core/pathways.py +298 -0
  26. xpk/core/ray.py +222 -0
  27. xpk/core/system_characteristics.py +1395 -0
  28. xpk/core/workload.py +133 -0
  29. xpk/core/workload_decorators/__init__.py +15 -0
  30. xpk/core/workload_decorators/rdma_decorator.py +109 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +157 -0
  32. xpk/main.py +73 -0
  33. xpk/parser/__init__.py +15 -0
  34. xpk/parser/batch.py +184 -0
  35. xpk/parser/cluster.py +621 -0
  36. xpk/parser/common.py +71 -0
  37. xpk/parser/core.py +109 -0
  38. xpk/parser/info.py +63 -0
  39. xpk/parser/inspector.py +65 -0
  40. xpk/parser/job.py +126 -0
  41. xpk/parser/kind.py +94 -0
  42. xpk/parser/shell.py +50 -0
  43. xpk/parser/validators.py +39 -0
  44. xpk/parser/version.py +23 -0
  45. xpk/parser/workload.py +684 -0
  46. xpk/utils/__init__.py +15 -0
  47. xpk/utils/console.py +55 -0
  48. xpk/utils/file.py +82 -0
  49. xpk/utils/network.py +168 -0
  50. xpk/utils/objects.py +85 -0
  51. xpk/utils/yaml.py +30 -0
  52. {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/METADATA +307 -38
  53. xpk-0.6.0.dist-info/RECORD +57 -0
  54. {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/WHEEL +1 -1
  55. xpk-0.6.0.dist-info/entry_points.txt +2 -0
  56. xpk-0.4.0.dist-info/RECORD +0 -7
  57. xpk-0.4.0.dist-info/entry_points.txt +0 -2
  58. xpk.py +0 -7218
  59. {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/LICENSE +0 -0
  60. {xpk-0.4.0.dist-info → xpk-0.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,652 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import shutil
18
+ from typing import Optional
19
+ from ruamel import yaml
20
+ import os
21
+
22
+ from .blueprint_definitions import DeploymentGroup, DeploymentModule, Blueprint
23
+ from ..system_characteristics import get_system_characteristics_by_device_type
24
+ from ...utils.console import xpk_print, xpk_exit
25
+ from ...utils.file import ensure_directory_exists
26
+ from ..core import CapacityType, h100_mega_device_type, h200_device_type
27
+
28
+ yaml = yaml.YAML()
29
+
30
+ a3mega_device_type = h100_mega_device_type
31
+ a3ultra_device_type = h200_device_type
32
+ supported_device_types = {a3mega_device_type, a3ultra_device_type}
33
+ blueprint_dependencies_dir = {
34
+ a3mega_device_type: "src/xpk/blueprints/a3mega",
35
+ a3ultra_device_type: "src/xpk/blueprints/a3ultra",
36
+ }
37
+
38
+ cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
39
+ cluster_toolkit_version = "v1.45.1"
40
+
41
+
42
+ class BlueprintGeneratorOutput:
43
+ """BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
44
+ Atributes:
45
+ - blueprint_file (str) : path to generated blueprint file.
46
+ - blueprint_dependencies (str) : path to directory containing blueprint dependencies.
47
+ """
48
+
49
+ def __init__(self, blueprint_file: str, blueprint_dependencies: str) -> None:
50
+ self.blueprint_file = blueprint_file
51
+ self.blueprint_dependencies = blueprint_dependencies
52
+
53
+
54
+ class BlueprintGenerator:
55
+ """BlueprintGenerator is a class for generating blueprints
56
+ Atributes:
57
+ - storage_path (str) - path to directory where generated files and directories will be stored.
58
+ """
59
+
60
+ def __init__(self, storage_path: str) -> None:
61
+ self.storage_path = storage_path
62
+
63
+ def generate_a3_mega_blueprint(
64
+ self,
65
+ cluster_name: str,
66
+ project_id: str,
67
+ blueprint_name: str,
68
+ region: str,
69
+ zone: str,
70
+ auth_cidr: str,
71
+ prefix: str = "",
72
+ num_nodes: int = 2,
73
+ pods_ip_cidr_range: str = "10.4.0.0/14",
74
+ services_ip_cidr_range: str = "10.0.32.0/20",
75
+ global_ip_address_range: str = "192.169.0.0/16",
76
+ system_node_pool_machine_type: str = "e2-standard-32",
77
+ primary_vpc_name: str = "network1",
78
+ gpu_subnets_name: str = "gpunets",
79
+ group_placement_max_distance: int = 2,
80
+ subnetwork_cidr_suffix: int = 24,
81
+ reservation: str | None = None,
82
+ capacity_type: CapacityType = CapacityType.ON_DEMAND,
83
+ system_node_pool_min_node_count: int = 2,
84
+ ) -> BlueprintGeneratorOutput:
85
+ """Create A3 mega blueprint and directory containing its dependencies.
86
+
87
+ Returns:
88
+ - BlueprintGeneratorOutput object containing path to blueprint and its dependencies.
89
+ """
90
+ xpk_print(f"Generating {blueprint_name} blueprint started...")
91
+ system, _ = get_system_characteristics_by_device_type(a3mega_device_type)
92
+ if system is None:
93
+ xpk_print(
94
+ "Error: Could not retrieve system characteristics for"
95
+ f" {a3mega_device_type} device_type."
96
+ )
97
+ xpk_exit(1)
98
+ subnetwork_name = f"{cluster_name}-xpk-gke-a3-megagpu-subnet"
99
+ primary_vpc = DeploymentModule(
100
+ id=primary_vpc_name,
101
+ source="modules/network/vpc",
102
+ settings={
103
+ "subnetwork_name": subnetwork_name,
104
+ "secondary_ranges": {
105
+ subnetwork_name: [
106
+ {"range_name": "pods", "ip_cidr_range": pods_ip_cidr_range},
107
+ {
108
+ "range_name": "services",
109
+ "ip_cidr_range": services_ip_cidr_range,
110
+ },
111
+ ]
112
+ },
113
+ },
114
+ )
115
+ gpunets = DeploymentModule(
116
+ id=gpu_subnets_name,
117
+ source="modules/network/multivpc",
118
+ settings={
119
+ "network_name_prefix": f"{cluster_name}-gpunet",
120
+ "global_ip_address_range": global_ip_address_range,
121
+ "network_count": 8,
122
+ "subnetwork_cidr_suffix": subnetwork_cidr_suffix,
123
+ },
124
+ )
125
+
126
+ gke_cluster = DeploymentModule(
127
+ id="gke_cluster",
128
+ source="modules/scheduler/gke-cluster",
129
+ use=[primary_vpc_name, gpu_subnets_name],
130
+ settings={
131
+ "release_channel": "RAPID",
132
+ "prefix_with_deployment_name": False,
133
+ "name_suffix": cluster_name,
134
+ "enable_private_endpoint": False,
135
+ "master_authorized_networks": [{
136
+ "cidr_block": (
137
+ f"{auth_cidr}"
138
+ ), # Allows your machine run kubectl command. It's required for the multi-network setup.
139
+ "display_name": "kubectl-access-network",
140
+ }],
141
+ "system_node_pool_machine_type": system_node_pool_machine_type,
142
+ "system_node_pool_node_count": {
143
+ "total_min_nodes": system_node_pool_min_node_count,
144
+ "total_max_nodes": 1000,
145
+ },
146
+ },
147
+ outputs=["instructions"],
148
+ )
149
+
150
+ group_placement_0 = DeploymentModule(
151
+ id="group_placement_0",
152
+ source="modules/compute/resource-policy",
153
+ settings={
154
+ "name": f"{cluster_name}-gp-np-0",
155
+ "group_placement_max_distance": group_placement_max_distance,
156
+ },
157
+ )
158
+
159
+ reservation_affinity = (
160
+ {
161
+ "consume_reservation_type": "NO_RESERVATION",
162
+ "specific_reservations": [],
163
+ }
164
+ if reservation is None
165
+ else {
166
+ "consume_reservation_type": "SPECIFIC_RESERVATION",
167
+ "specific_reservations": [{"name": reservation}],
168
+ }
169
+ )
170
+
171
+ a3_megagpu_pool_0 = DeploymentModule(
172
+ id="a3_megagpu_pool_0",
173
+ source="modules/compute/gke-node-pool",
174
+ use=["gke_cluster", gpu_subnets_name, "group_placement_0"],
175
+ settings={
176
+ "name": f"{cluster_name}-a3-megagpu-pool-0",
177
+ "machine_type": system.gce_machine_type,
178
+ "static_node_count": num_nodes,
179
+ "zones": [zone],
180
+ "host_maintenance_interval": "PERIODIC",
181
+ "reservation_affinity": reservation_affinity,
182
+ "run_workload_script": False,
183
+ "spot": capacity_type == CapacityType.SPOT,
184
+ "max_pods_per_node": 32,
185
+ "auto_upgrade": True,
186
+ },
187
+ outputs=["instructions"],
188
+ )
189
+ num_chips = num_nodes * system.chips_per_vm
190
+ workload = DeploymentModule(
191
+ id="workload_component_install",
192
+ source="modules/management/kubectl-apply",
193
+ use=["gke_cluster"],
194
+ settings={
195
+ "kueue": {
196
+ "install": True,
197
+ "version": "v0.10.0", # TAS feature-gates is enabled in CT
198
+ "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
199
+ "config_template_vars": {"num_chips": f"{num_chips}"},
200
+ },
201
+ "jobset": {"install": True, "version": "v0.7.2"},
202
+ },
203
+ )
204
+
205
+ workload_configmap = DeploymentModule(
206
+ id="workload_configmap",
207
+ source="modules/management/kubectl-apply",
208
+ use=["gke_cluster"],
209
+ settings={
210
+ "apply_manifests": [{
211
+ "source": (
212
+ f'$(ghpc_stage("{blueprint_name}"))/config-map.yaml.tftpl'
213
+ ),
214
+ "template_vars": {
215
+ "resource_config_name": (
216
+ f"{cluster_name}-resources-configmap"
217
+ ),
218
+ "num_nodes": f"{num_nodes}",
219
+ "cluster_config_name": f"{cluster_name}-metadata-configmap",
220
+ "capacity_type": f"{capacity_type.value}",
221
+ "reservation": f"{reservation}",
222
+ },
223
+ }]
224
+ },
225
+ )
226
+ primary_group = DeploymentGroup(
227
+ group="primary",
228
+ modules=[
229
+ primary_vpc,
230
+ gpunets,
231
+ gke_cluster,
232
+ group_placement_0,
233
+ a3_megagpu_pool_0,
234
+ workload,
235
+ workload_configmap,
236
+ ],
237
+ )
238
+ xpk_blueprint = Blueprint(
239
+ blueprint_name=blueprint_name,
240
+ toolkit_modules_url=cluster_toolkit_url,
241
+ toolkit_modules_version=cluster_toolkit_version,
242
+ deployment_groups=[primary_group],
243
+ vars={
244
+ "project_id": project_id,
245
+ "deployment_name": blueprint_name,
246
+ "region": region,
247
+ "zone": zone,
248
+ },
249
+ )
250
+ blueprint_file_path = self._save_blueprint_to_file(
251
+ blueprint_name, xpk_blueprint, prefix
252
+ )
253
+ blueprint_dependencies = self._get_a3_mega_blueprint_dependencies(
254
+ blueprint_name, prefix
255
+ )
256
+ xpk_print(f"Blueprint file path: {blueprint_file_path}")
257
+ xpk_print(
258
+ f"Blueprint dependencies directory path: {blueprint_dependencies}"
259
+ )
260
+ xpk_print(f"The {blueprint_name} blueprint generated.")
261
+ return BlueprintGeneratorOutput(
262
+ blueprint_file=blueprint_file_path,
263
+ blueprint_dependencies=blueprint_dependencies,
264
+ )
265
+
266
+ def generate_gke_ml_blueprint(
267
+ self,
268
+ cluster_name: str,
269
+ blueprint_name: str,
270
+ project_id: str,
271
+ region: str,
272
+ auth_cidr: str,
273
+ prefix: str = "",
274
+ ) -> BlueprintGeneratorOutput:
275
+ """Create a simple gke cluster
276
+
277
+ Returns:
278
+ Blueprint: blueprint of simple cluster to create. This blueprint doesn't have any dependencies.
279
+ """
280
+
281
+ network1 = DeploymentModule(
282
+ id="network1",
283
+ source="modules/network/vpc",
284
+ settings={
285
+ "subnetwork_name": f"{blueprint_name}-gke-subnet",
286
+ "secondary_ranges": {
287
+ f"{blueprint_name}-gke-subnet": [
288
+ {"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
289
+ {
290
+ "range_name": "services",
291
+ "ip_cidr_range": "10.0.32.0/20",
292
+ },
293
+ ]
294
+ },
295
+ },
296
+ )
297
+
298
+ gke_cluster = DeploymentModule(
299
+ id="gke_cluster",
300
+ source="modules/scheduler/gke-cluster",
301
+ use=["network1"],
302
+ settings={
303
+ "prefix_with_deployment_name": False,
304
+ "name_suffix": cluster_name,
305
+ "enable_private_endpoint": (
306
+ "false"
307
+ ), # Allows for access from authorized public IPs
308
+ "master_authorized_networks": [{
309
+ "display_name": "deployment-machine",
310
+ "cidr_block": auth_cidr,
311
+ }],
312
+ },
313
+ outputs=["instructions"],
314
+ )
315
+
316
+ primary_group = DeploymentGroup(
317
+ group="primary",
318
+ modules=[network1, gke_cluster],
319
+ )
320
+ ml_gke = Blueprint(
321
+ blueprint_name=blueprint_name,
322
+ toolkit_modules_url=cluster_toolkit_url,
323
+ toolkit_modules_version=cluster_toolkit_version,
324
+ deployment_groups=[primary_group],
325
+ vars={
326
+ "project_id": project_id,
327
+ "deployment_name": blueprint_name,
328
+ "region": region,
329
+ },
330
+ )
331
+ blueprint_file_path = self._save_blueprint_to_file(
332
+ blueprint_name, ml_gke, prefix
333
+ )
334
+ blueprint_dependencies = ""
335
+ return BlueprintGeneratorOutput(
336
+ blueprint_file=blueprint_file_path,
337
+ blueprint_dependencies=blueprint_dependencies,
338
+ )
339
+
340
+ def _save_blueprint_to_file(
341
+ self, blueprint_name: str, xpk_blueprint: Blueprint, prefix: str = ""
342
+ ) -> str:
343
+ blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
344
+ with open(blueprint_path, "w+", encoding="utf-8") as blueprint_file:
345
+ yaml.dump(xpk_blueprint, blueprint_file)
346
+ return blueprint_path
347
+
348
+ def _get_blueprint_path(self, blueprint_name, prefix: str = ""):
349
+ blueprint_path = os.path.join(
350
+ self._get_storage_path(prefix), f"{blueprint_name}.yaml"
351
+ )
352
+ return blueprint_path
353
+
354
+ def _get_storage_path(self, prefix):
355
+ storage_path_with_prefix = os.path.join(self.storage_path, prefix)
356
+ ensure_directory_exists(storage_path_with_prefix)
357
+ return storage_path_with_prefix
358
+
359
+ def blueprint_exists(self, blueprint_name, prefix: str = ""):
360
+ blueprint_path = self._get_blueprint_path(blueprint_name, prefix)
361
+ return os.path.exists(blueprint_path)
362
+
363
+ def _get_a3_mega_blueprint_dependencies(
364
+ self, blueprint_name: str, prefix: str = ""
365
+ ) -> str:
366
+ deployment_files_path = os.path.join(
367
+ self._get_storage_path(prefix), blueprint_name
368
+ )
369
+ shutil.copytree(
370
+ blueprint_dependencies_dir[a3mega_device_type],
371
+ deployment_files_path,
372
+ dirs_exist_ok=True,
373
+ )
374
+ return deployment_files_path
375
+
376
+ def _get_a3_ultra_blueprint_dependencies(
377
+ self, blueprint_name: str, prefix: str = ""
378
+ ) -> str:
379
+ deployment_files_path = os.path.join(
380
+ self._get_storage_path(prefix), blueprint_name
381
+ )
382
+ shutil.copytree(
383
+ blueprint_dependencies_dir[a3ultra_device_type],
384
+ deployment_files_path,
385
+ dirs_exist_ok=True,
386
+ )
387
+ return deployment_files_path
388
+
389
+ def generate_a3_ultra_blueprint(
390
+ self,
391
+ project_id: str,
392
+ cluster_name: str,
393
+ blueprint_name: str,
394
+ region: str,
395
+ zone: str,
396
+ auth_cidr: str,
397
+ system_node_pool_machine_type: str,
398
+ reservation: Optional[str | None] = None,
399
+ num_nodes: int = 2,
400
+ prefix: str = "",
401
+ mtu_size: int = 8896,
402
+ system_node_pool_min_node_count: int = 2,
403
+ capacity_type: CapacityType = CapacityType.ON_DEMAND,
404
+ ) -> BlueprintGeneratorOutput:
405
+ """Create A3 ultra blueprint.
406
+
407
+ Args:
408
+ Returns:
409
+ - Blueprint representing cluster toolkit blueprint
410
+ """
411
+
412
+ nccl_installer_path = (
413
+ f'$(ghpc_stage("{blueprint_name}"))/nccl-installer.yaml'
414
+ )
415
+ mlgru_disable_path = f'$(ghpc_stage("{blueprint_name}"))/mlgru-disable.yaml'
416
+ net_0_id = f"{cluster_name}-net-0"
417
+ gpu_net_0 = DeploymentModule(
418
+ id=net_0_id,
419
+ source="modules/network/vpc",
420
+ settings={
421
+ "network_name": f"{cluster_name}-net-0",
422
+ "subnetworks": [{
423
+ "subnet_name": f"{cluster_name}-sub-0",
424
+ "subnet_region": region,
425
+ "subnet_ip": "192.168.0.0/18",
426
+ }],
427
+ "secondary_ranges_list": [{
428
+ "subnetwork_name": f"{cluster_name}-sub-0",
429
+ "ranges": [
430
+ {"range_name": "pods", "ip_cidr_range": "10.4.0.0/14"},
431
+ {"range_name": "services", "ip_cidr_range": "10.0.32.0/20"},
432
+ ],
433
+ }],
434
+ "firewall_rules": [{
435
+ "name": f"{cluster_name}-internal-0",
436
+ "ranges": ["192.168.0.0/16"],
437
+ "allow": [
438
+ {"protocol": "tcp", "ports": ["0-65535"]},
439
+ {"protocol": "udp", "ports": ["0-65535"]},
440
+ {"protocol": "icmp"},
441
+ ],
442
+ }],
443
+ },
444
+ )
445
+ net_1_id = f"{cluster_name}-net-1"
446
+ gpu_net_1 = DeploymentModule(
447
+ id=net_1_id,
448
+ source="modules/network/vpc",
449
+ settings={
450
+ "network_name": f"{cluster_name}-net-1",
451
+ "mtu": mtu_size,
452
+ "subnetworks": [{
453
+ "subnet_name": f"{cluster_name}-sub-1",
454
+ "subnet_region": region,
455
+ "subnet_ip": "192.168.64.0/18",
456
+ }],
457
+ "firewall_rules": [{
458
+ "name": f"{cluster_name}-internal-1",
459
+ "ranges": ["192.168.0.0/16"],
460
+ "allow": [
461
+ {"protocol": "tcp", "ports": ["0-65535"]},
462
+ {"protocol": "udp", "ports": ["0-65535"]},
463
+ {"protocol": "icmp"},
464
+ ],
465
+ }],
466
+ },
467
+ )
468
+ rma_net_id = f"{cluster_name}-rdma-net"
469
+ rma_net = DeploymentModule(
470
+ id=rma_net_id,
471
+ source="modules/network/gpu-rdma-vpc",
472
+ settings={
473
+ "network_name": f"{cluster_name}-rdma-net",
474
+ "mtu": mtu_size,
475
+ "network_profile": f"https://www.googleapis.com/compute/beta/projects/{project_id}/global/networkProfiles/{zone}-vpc-roce",
476
+ "network_routing_mode": "REGIONAL",
477
+ "subnetworks_template": {
478
+ "name_prefix": f"{cluster_name}-rdma-sub",
479
+ "count": 8,
480
+ "ip_range": "192.168.128.0/18",
481
+ "region": region,
482
+ },
483
+ },
484
+ )
485
+ cluster_id = f"{cluster_name}-a3-ultragpu-cluster"
486
+ a3_ultra_cluster = DeploymentModule(
487
+ id=cluster_id,
488
+ source="modules/scheduler/gke-cluster",
489
+ use=[net_0_id],
490
+ settings={
491
+ "release_channel": "RAPID",
492
+ "version_prefix": "1.31.",
493
+ "maintenance_exclusions": [{
494
+ "name": "no-minor-or-node-upgrades-indefinite",
495
+ "start_time": "2024-12-01T00:00:00Z",
496
+ "end_time": "2025-12-22T00:00:00Z",
497
+ "exclusion_scope": "NO_MINOR_OR_NODE_UPGRADES",
498
+ }],
499
+ "prefix_with_deployment_name": False,
500
+ "name_suffix": cluster_name,
501
+ "system_node_pool_machine_type": system_node_pool_machine_type,
502
+ "enable_dcgm_monitoring": True,
503
+ "enable_gcsfuse_csi": True,
504
+ "enable_private_endpoint": False,
505
+ "master_authorized_networks": [{
506
+ "cidr_block": auth_cidr,
507
+ "display_name": "kubectl-access-network",
508
+ }],
509
+ "system_node_pool_node_count": {
510
+ "total_min_nodes": system_node_pool_min_node_count,
511
+ "total_max_nodes": 1000,
512
+ },
513
+ "additional_networks": (
514
+ f"$(concat([{{network={cluster_name}-net-1.network_name,"
515
+ f" subnetwork={cluster_name}-net-1.subnetwork_name,"
516
+ f' subnetwork_project="{project_id}", nic_type="GVNIC",'
517
+ " queue_count=null, network_ip=null, stack_type=null,"
518
+ " access_config=[{nat_ip=null, public_ptr_domain_name=null,"
519
+ " network_tier=null}], ipv6_access_config=[],"
520
+ " alias_ip_range=[]}],"
521
+ f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
522
+ ),
523
+ },
524
+ outputs=["instructions"],
525
+ )
526
+ system, _ = get_system_characteristics_by_device_type(a3ultra_device_type)
527
+ if system is None:
528
+ xpk_print(
529
+ "Error: Could not retrieve system characteristics for"
530
+ f" {a3ultra_device_type} device_type."
531
+ )
532
+ xpk_exit(1)
533
+ gpu_pool = DeploymentModule(
534
+ id=f"{cluster_name}-a3u-pool",
535
+ source="modules/compute/gke-node-pool",
536
+ use=[cluster_id],
537
+ settings={
538
+ "machine_type": system.gce_machine_type,
539
+ "auto_upgrade": True,
540
+ "zones": [zone],
541
+ "static_node_count": num_nodes,
542
+ "spot": capacity_type == CapacityType.SPOT,
543
+ "max_pods_per_node": 32,
544
+ "guest_accelerator": [{
545
+ "type": "nvidia-h200-141gb",
546
+ "count": 8,
547
+ "gpu_driver_installation_config": {
548
+ "gpu_driver_version": "LATEST"
549
+ },
550
+ }],
551
+ "additional_networks": (
552
+ f"$(concat([{{network={cluster_name}-net-1.network_name,"
553
+ f" subnetwork={cluster_name}-net-1.subnetwork_name,"
554
+ f' subnetwork_project="{project_id}", nic_type="GVNIC",'
555
+ " queue_count=null, network_ip=null, stack_type=null,"
556
+ " access_config=[{nat_ip=null, public_ptr_domain_name=null,"
557
+ " network_tier=null}], ipv6_access_config=[],"
558
+ " alias_ip_range=[]}],"
559
+ f" {cluster_name}-rdma-net.subnetwork_interfaces_gke))"
560
+ ),
561
+ },
562
+ outputs=["instructions"],
563
+ )
564
+ if reservation is not None:
565
+ gpu_pool.settings["reservation_affinity"] = {
566
+ "consume_reservation_type": "SPECIFIC_RESERVATION",
567
+ "specific_reservations": [{"name": reservation}],
568
+ }
569
+
570
+ num_chips = num_nodes * system.chips_per_vm
571
+ workload_manager_install_id = "workload-manager-install"
572
+ workload_manager_install = DeploymentModule(
573
+ id=workload_manager_install_id,
574
+ source="modules/management/kubectl-apply",
575
+ use=[cluster_id],
576
+ settings={
577
+ "kueue": {
578
+ "install": True,
579
+ "version": "v0.10.0", # TAS feature-gates is enabled in CT
580
+ "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
581
+ "config_template_vars": {"num_chips": f"{num_chips}"},
582
+ },
583
+ "jobset": {"install": True, "version": "v0.7.2"},
584
+ "apply_manifests": [
585
+ {"source": nccl_installer_path},
586
+ {"source": mlgru_disable_path},
587
+ ],
588
+ },
589
+ )
590
+
591
+ workload_configmap = DeploymentModule(
592
+ id="workload_configmap",
593
+ source="modules/management/kubectl-apply",
594
+ use=[cluster_id],
595
+ settings={
596
+ "apply_manifests": [{
597
+ "source": (
598
+ f'$(ghpc_stage("{blueprint_name}"))/config-map.yaml.tftpl'
599
+ ),
600
+ "template_vars": {
601
+ "resource_config_name": (
602
+ f"{cluster_name}-resources-configmap"
603
+ ),
604
+ "num_nodes": f"{num_nodes}",
605
+ "cluster_config_name": f"{cluster_name}-metadata-configmap",
606
+ "capacity_type": f"{capacity_type.value}",
607
+ "reservation": f"{reservation}",
608
+ },
609
+ }]
610
+ },
611
+ )
612
+
613
+ primary_group = DeploymentGroup(
614
+ group="primary",
615
+ modules=[
616
+ gpu_net_0,
617
+ gpu_net_1,
618
+ rma_net,
619
+ a3_ultra_cluster,
620
+ gpu_pool,
621
+ workload_manager_install,
622
+ workload_configmap,
623
+ ],
624
+ )
625
+ a3_ultra_blueprint = Blueprint(
626
+ blueprint_name=blueprint_name,
627
+ toolkit_modules_url=cluster_toolkit_url,
628
+ toolkit_modules_version=cluster_toolkit_version,
629
+ deployment_groups=[primary_group],
630
+ vars={
631
+ "project_id": project_id,
632
+ "deployment_name": blueprint_name,
633
+ "region": region,
634
+ "zone": zone,
635
+ },
636
+ )
637
+
638
+ blueprint_file_path = self._save_blueprint_to_file(
639
+ blueprint_name, a3_ultra_blueprint, prefix
640
+ )
641
+ blueprint_dependencies = self._get_a3_ultra_blueprint_dependencies(
642
+ blueprint_name, prefix
643
+ )
644
+ return BlueprintGeneratorOutput(
645
+ blueprint_file=blueprint_file_path,
646
+ blueprint_dependencies=blueprint_dependencies,
647
+ )
648
+
649
+
650
+ yaml.register_class(Blueprint)
651
+ yaml.register_class(DeploymentGroup)
652
+ yaml.register_class(DeploymentModule)