xpk 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. integration/README.md +19 -0
  2. xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
  3. xpk/blueprints/a3mega/storage_crd.yaml +52 -0
  4. xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
  5. xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
  6. xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
  7. xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
  8. xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
  9. xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
  10. xpk/blueprints/a4/storage_crd.yaml +52 -0
  11. xpk/commands/cluster.py +33 -12
  12. xpk/commands/cluster_gcluster_test.py +5 -1
  13. xpk/commands/cluster_test.py +125 -0
  14. xpk/commands/config.py +3 -3
  15. xpk/commands/inspector.py +5 -3
  16. xpk/commands/kind.py +2 -0
  17. xpk/commands/managed_ml_diagnostics.py +249 -0
  18. xpk/commands/managed_ml_diagnostics_test.py +146 -0
  19. xpk/commands/workload.py +125 -139
  20. xpk/commands/workload_test.py +160 -118
  21. xpk/core/blueprint/blueprint_generator.py +3 -0
  22. xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
  23. xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
  24. xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
  25. xpk/core/blueprint/testing/data/a4.yaml +185 -0
  26. xpk/core/capacity.py +2 -0
  27. xpk/core/cluster.py +18 -47
  28. xpk/core/cluster_test.py +76 -1
  29. xpk/core/config.py +81 -7
  30. xpk/core/config_test.py +67 -11
  31. xpk/core/docker_container.py +3 -1
  32. xpk/core/docker_image.py +10 -6
  33. xpk/core/docker_resources.py +1 -10
  34. xpk/core/kjob.py +17 -16
  35. xpk/core/kueue_manager.py +13 -19
  36. xpk/core/kueue_manager_test.py +27 -1
  37. xpk/core/nap.py +13 -14
  38. xpk/core/nodepool.py +17 -15
  39. xpk/core/nodepool_test.py +25 -4
  40. xpk/core/pathways.py +23 -0
  41. xpk/core/pathways_test.py +57 -0
  42. xpk/core/resources.py +84 -27
  43. xpk/core/scheduling.py +128 -132
  44. xpk/core/scheduling_test.py +215 -2
  45. xpk/core/system_characteristics.py +179 -0
  46. xpk/core/system_characteristics_test.py +49 -1
  47. xpk/core/telemetry.py +4 -4
  48. xpk/core/telemetry_test.py +9 -9
  49. xpk/core/vertex.py +4 -3
  50. xpk/core/workload_decorators/tcpx_decorator.py +5 -1
  51. xpk/main.py +2 -0
  52. xpk/parser/cluster.py +22 -88
  53. xpk/parser/cluster_test.py +41 -0
  54. xpk/parser/common.py +84 -0
  55. xpk/parser/storage.py +10 -0
  56. xpk/parser/storage_test.py +47 -0
  57. xpk/parser/workload.py +14 -41
  58. xpk/parser/workload_test.py +2 -48
  59. xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
  60. xpk/utils/feature_flags.py +3 -0
  61. xpk/utils/validation.py +2 -2
  62. xpk-0.16.1.dist-info/METADATA +127 -0
  63. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/RECORD +67 -48
  64. xpk-0.15.0.dist-info/METADATA +0 -1666
  65. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/WHEEL +0 -0
  66. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/entry_points.txt +0 -0
  67. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/licenses/LICENSE +0 -0
  68. {xpk-0.15.0.dist-info → xpk-0.16.1.dist-info}/top_level.txt +0 -0
@@ -15,11 +15,29 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from dataclasses import dataclass
18
+ import dataclasses
19
+ from typing import Callable, Literal, Optional
20
+
21
+ from ..core.workload_decorators import rdma_decorator, tcpxo_decorator, tcpx_decorator
18
22
  from ..utils.topology import get_topology_product
19
23
  from enum import Enum
20
24
 
21
25
  SUB_SLICING_TOPOLOGIES = ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
22
26
 
27
# NCCL installer manifest URLs. Each one points at a YAML file under a
# gpudirect-* directory of the GoogleCloudPlatform/container-engine-accelerators
# repository (master branch).
INSTALLER_NCCL_TCPX = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/'
    'gpudirect-tcpx/nccl-tcpx-installer.yaml'
)
INSTALLER_NCCL_TCPXO = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/'
    'gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
)
INSTALLER_NCCL_RDMA = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/'
    'gpudirect-rdma/nccl-rdma-installer.yaml'
)
INSTALLER_NCCL_RDMA_A4X = (
    'https://raw.githubusercontent.com/GoogleCloudPlatform/'
    'container-engine-accelerators/master/'
    'gpudirect-rdma/nccl-rdma-installer-a4x.yaml'
)
31
+
32
+
33
class DockerPlatform(str, Enum):
  """Docker platform strings in '<os>/<architecture>' form.

  Members are also `str` instances, so they compare equal to their raw value.
  """

  AMD = 'linux/amd64'
  ARM = 'linux/arm64'


# Module-level shorthands for the two enum members.
AMD_PLATFORM = DockerPlatform.AMD
ARM_PLATFORM = DockerPlatform.ARM
40
+
23
41
 
24
42
  class AcceleratorType(Enum):
25
43
  TPU = 1
@@ -56,6 +74,45 @@ AcceleratorTypeToAcceleratorCharacteristics = {
56
74
  }
57
75
 
58
76
 
77
@dataclass
class GpuConfig:
  """Contains GPU-specific configuration and requirements.

  Attributes:
    requires_topology: Whether this GPU type requires topology.
    gpu_direct_name: GPUDirect flavour; one of 'fastrak', 'rdma', 'tcpx' or
      'tcpxo'. Defaults to 'fastrak'.
    kjob_decorator_fn: Optional function to decorate the kjob template for
      GPU-specific configurations. It receives the kjob manifest as a dict
      and returns the modified kjob manifest as a dict.
    nccl_installer: Optional NCCL installer reference (the system table in
      this module passes installer manifest URLs).
    jobset_decorator_fn: Optional function to decorate the jobset for
      GPU-specific configurations. It receives the JobSet manifest as a YAML
      string plus a list of sub-network names (used by some decorators) and
      returns the modified JobSet manifest as a YAML string.
  """

  requires_topology: bool
  gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
  kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
  nccl_installer: Optional[str] = None
  jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None

  def __repr__(self) -> str:
    """Returns a string representation of the GpuConfig, omitting memory addresses for functions.

    Decorator callables are rendered as `<function NAME>`. Callables that
    carry no `__name__` (for example `functools.partial` objects or callable
    instances) are rendered with their type name instead of raising.
    """
    decorator_fields = ('kjob_decorator_fn', 'jobset_decorator_fn')
    parts = []
    for f in dataclasses.fields(self):
      value = getattr(self, f.name)
      if f.name in decorator_fields and value is not None:
        # Not every callable has __name__; fall back to the type name so that
        # repr() never raises AttributeError and still hides the address.
        name = getattr(value, '__name__', type(value).__name__)
        parts.append(f'{f.name}=<function {name}>')
      else:
        parts.append(f'{f.name}={value!r}')
    return f"GpuConfig({', '.join(parts)})"
114
+
115
+
59
116
  @dataclass
60
117
  class SystemCharacteristics:
61
118
  """Contains the defining characteristics of a specific accelerator system.
@@ -92,12 +149,28 @@ class SystemCharacteristics:
92
149
  accelerator_type: AcceleratorType
93
150
  device_type: str
94
151
  supports_sub_slicing: bool
152
+ docker_platform: DockerPlatform
95
153
  requires_workload_policy: bool = False
154
+ gpu_config: Optional[GpuConfig] = None
96
155
 
97
156
  def __post_init__(self):
98
157
  if self.accelerator_type == AcceleratorType.GPU:
99
158
  self.requires_workload_policy = True
100
159
 
160
+ if self.gpu_config is None:
161
+ raise ValueError(
162
+ f"Validation Error: System '{self.device_type}' is a GPU, "
163
+ "but 'gpu_config' was not provided."
164
+ )
165
+
166
@property
def gpu_requires_topology(self) -> bool:
  """Reports whether the attached GPU config requires topology.

  Returns False when no GPU config is set, so callers never have to check
  `gpu_config` themselves.
  """
  if not self.gpu_config:
    return False
  return self.gpu_config.requires_topology
173
+
101
174
 
102
175
  def get_system_characteristics(
103
176
  args,
@@ -167,6 +240,7 @@ def get_tpu_system_characteristics_map(
167
240
  machine_type: str,
168
241
  supported_topologies: list[str],
169
242
  supports_sub_slicing: bool,
243
+ docker_platform: DockerPlatform,
170
244
  tpu_type_requires_workload_policy: bool = False,
171
245
  default_topologies: set[str] | None = None,
172
246
  ) -> dict[str, SystemCharacteristics]:
@@ -189,6 +263,7 @@ def get_tpu_system_characteristics_map(
189
263
  requires_workload_policy=tpu_type_requires_workload_policy
190
264
  and vms_per_slice > 1,
191
265
  supports_sub_slicing=supports_sub_slicing,
266
+ docker_platform=docker_platform,
192
267
  )
193
268
  system_characteristics_map[f'{prefix}-{topology}'] = system
194
269
  if (
@@ -231,6 +306,8 @@ UserFacingNameToSystemCharacteristics = {
231
306
  accelerator_type=AcceleratorType.GPU,
232
307
  device_type='l4-1',
233
308
  supports_sub_slicing=False,
309
+ gpu_config=GpuConfig(requires_topology=False),
310
+ docker_platform=AMD_PLATFORM,
234
311
  ),
235
312
  'l4-2': SystemCharacteristics(
236
313
  topology='N/A',
@@ -241,6 +318,8 @@ UserFacingNameToSystemCharacteristics = {
241
318
  accelerator_type=AcceleratorType.GPU,
242
319
  device_type='l4-2',
243
320
  supports_sub_slicing=False,
321
+ gpu_config=GpuConfig(requires_topology=False),
322
+ docker_platform=AMD_PLATFORM,
244
323
  ),
245
324
  'l4-4': SystemCharacteristics(
246
325
  topology='N/A',
@@ -251,6 +330,8 @@ UserFacingNameToSystemCharacteristics = {
251
330
  accelerator_type=AcceleratorType.GPU,
252
331
  device_type='l4-4',
253
332
  supports_sub_slicing=False,
333
+ gpu_config=GpuConfig(requires_topology=False),
334
+ docker_platform=AMD_PLATFORM,
254
335
  ),
255
336
  'l4-8': SystemCharacteristics(
256
337
  topology='N/A',
@@ -261,6 +342,8 @@ UserFacingNameToSystemCharacteristics = {
261
342
  accelerator_type=AcceleratorType.GPU,
262
343
  device_type='l4-8',
263
344
  supports_sub_slicing=False,
345
+ gpu_config=GpuConfig(requires_topology=False),
346
+ docker_platform=AMD_PLATFORM,
264
347
  ),
265
348
  # A100-40gb-$CHIPSc
266
349
  'a100-40gb-1': SystemCharacteristics(
@@ -272,6 +355,8 @@ UserFacingNameToSystemCharacteristics = {
272
355
  accelerator_type=AcceleratorType.GPU,
273
356
  device_type='a100-40gb-1',
274
357
  supports_sub_slicing=False,
358
+ gpu_config=GpuConfig(requires_topology=False),
359
+ docker_platform=AMD_PLATFORM,
275
360
  ),
276
361
  'a100-40gb-2': SystemCharacteristics(
277
362
  topology='N/A',
@@ -282,6 +367,8 @@ UserFacingNameToSystemCharacteristics = {
282
367
  accelerator_type=AcceleratorType.GPU,
283
368
  device_type='a100-40gb-2',
284
369
  supports_sub_slicing=False,
370
+ gpu_config=GpuConfig(requires_topology=False),
371
+ docker_platform=AMD_PLATFORM,
285
372
  ),
286
373
  'a100-40gb-4': SystemCharacteristics(
287
374
  topology='N/A',
@@ -292,6 +379,8 @@ UserFacingNameToSystemCharacteristics = {
292
379
  accelerator_type=AcceleratorType.GPU,
293
380
  device_type='a100-40gb-4',
294
381
  supports_sub_slicing=False,
382
+ gpu_config=GpuConfig(requires_topology=False),
383
+ docker_platform=AMD_PLATFORM,
295
384
  ),
296
385
  'a100-40gb-8': SystemCharacteristics(
297
386
  topology='N/A',
@@ -302,6 +391,8 @@ UserFacingNameToSystemCharacteristics = {
302
391
  accelerator_type=AcceleratorType.GPU,
303
392
  device_type='a100-40gb-8',
304
393
  supports_sub_slicing=False,
394
+ gpu_config=GpuConfig(requires_topology=False),
395
+ docker_platform=AMD_PLATFORM,
305
396
  ),
306
397
  'gb200-4': SystemCharacteristics(
307
398
  topology='1x72',
@@ -312,6 +403,14 @@ UserFacingNameToSystemCharacteristics = {
312
403
  accelerator_type=AcceleratorType.GPU,
313
404
  device_type='gb200-4',
314
405
  supports_sub_slicing=False,
406
+ gpu_config=GpuConfig(
407
+ requires_topology=True,
408
+ nccl_installer=INSTALLER_NCCL_RDMA_A4X,
409
+ kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
410
+ jobset_decorator_fn=rdma_decorator.decorate_jobset,
411
+ gpu_direct_name='rdma',
412
+ ),
413
+ docker_platform=ARM_PLATFORM,
315
414
  ),
316
415
  'gb200-4-nolssd': SystemCharacteristics(
317
416
  topology='1x72',
@@ -322,6 +421,14 @@ UserFacingNameToSystemCharacteristics = {
322
421
  accelerator_type=AcceleratorType.GPU,
323
422
  device_type='gb200-4',
324
423
  supports_sub_slicing=False,
424
+ gpu_config=GpuConfig(
425
+ requires_topology=True,
426
+ nccl_installer=INSTALLER_NCCL_RDMA_A4X,
427
+ kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
428
+ jobset_decorator_fn=rdma_decorator.decorate_jobset,
429
+ gpu_direct_name='rdma',
430
+ ),
431
+ docker_platform=ARM_PLATFORM,
325
432
  ),
326
433
  'b200-8': SystemCharacteristics(
327
434
  topology='N/A',
@@ -332,6 +439,14 @@ UserFacingNameToSystemCharacteristics = {
332
439
  accelerator_type=AcceleratorType.GPU,
333
440
  device_type='b200-8',
334
441
  supports_sub_slicing=False,
442
+ gpu_config=GpuConfig(
443
+ requires_topology=True,
444
+ nccl_installer=INSTALLER_NCCL_RDMA,
445
+ kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
446
+ jobset_decorator_fn=rdma_decorator.decorate_jobset,
447
+ gpu_direct_name='rdma',
448
+ ),
449
+ docker_platform=AMD_PLATFORM,
335
450
  ),
336
451
  'h200-141gb-8': SystemCharacteristics(
337
452
  topology='N/A',
@@ -342,6 +457,14 @@ UserFacingNameToSystemCharacteristics = {
342
457
  accelerator_type=AcceleratorType.GPU,
343
458
  device_type='h200-141gb-8',
344
459
  supports_sub_slicing=False,
460
+ gpu_config=GpuConfig(
461
+ requires_topology=True,
462
+ nccl_installer=INSTALLER_NCCL_RDMA,
463
+ kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
464
+ jobset_decorator_fn=rdma_decorator.decorate_jobset,
465
+ gpu_direct_name='rdma',
466
+ ),
467
+ docker_platform=AMD_PLATFORM,
345
468
  ),
346
469
  # H100-80gb-$CHIPS
347
470
  'h100-80gb-8': SystemCharacteristics(
@@ -353,6 +476,14 @@ UserFacingNameToSystemCharacteristics = {
353
476
  accelerator_type=AcceleratorType.GPU,
354
477
  device_type='h100-80gb-8',
355
478
  supports_sub_slicing=False,
479
+ gpu_config=GpuConfig(
480
+ requires_topology=True,
481
+ nccl_installer=INSTALLER_NCCL_TCPX,
482
+ kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
483
+ jobset_decorator_fn=tcpx_decorator.decorate_jobset,
484
+ gpu_direct_name='tcpx',
485
+ ),
486
+ docker_platform=AMD_PLATFORM,
356
487
  ),
357
488
  # H100-mega-80gb-$CHIPS
358
489
  'h100-mega-80gb-8': SystemCharacteristics(
@@ -364,6 +495,14 @@ UserFacingNameToSystemCharacteristics = {
364
495
  accelerator_type=AcceleratorType.GPU,
365
496
  device_type='h100-mega-80gb-8',
366
497
  supports_sub_slicing=False,
498
+ gpu_config=GpuConfig(
499
+ requires_topology=True,
500
+ nccl_installer=INSTALLER_NCCL_TCPXO,
501
+ kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
502
+ jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
503
+ gpu_direct_name='tcpxo',
504
+ ),
505
+ docker_platform=AMD_PLATFORM,
367
506
  ),
368
507
  # TPU system characteristics
369
508
  **get_tpu_system_characteristics_map(
@@ -374,6 +513,7 @@ UserFacingNameToSystemCharacteristics = {
374
513
  supported_topologies=['1x1x1'],
375
514
  tpu_type_requires_workload_policy=True,
376
515
  supports_sub_slicing=False,
516
+ docker_platform=AMD_PLATFORM,
377
517
  ),
378
518
  **get_tpu_system_characteristics_map(
379
519
  prefix='tpu7x',
@@ -382,6 +522,7 @@ UserFacingNameToSystemCharacteristics = {
382
522
  machine_type='tpu7x-standard-4t',
383
523
  tpu_type_requires_workload_policy=True,
384
524
  supports_sub_slicing=False,
525
+ docker_platform=AMD_PLATFORM,
385
526
  supported_topologies=generate_tpu_topologies(max_cubes=144),
386
527
  default_topologies=set([
387
528
  '12x12x12',
@@ -491,6 +632,7 @@ UserFacingNameToSystemCharacteristics = {
491
632
  machine_type='ct6e-standard-1t',
492
633
  supports_sub_slicing=False,
493
634
  supported_topologies=['1x1'],
635
+ docker_platform=AMD_PLATFORM,
494
636
  ),
495
637
  **get_tpu_system_characteristics_map(
496
638
  prefix='v6e',
@@ -501,6 +643,7 @@ UserFacingNameToSystemCharacteristics = {
501
643
  supported_topologies=[
502
644
  '2x2',
503
645
  ],
646
+ docker_platform=AMD_PLATFORM,
504
647
  ),
505
648
  **get_tpu_system_characteristics_map(
506
649
  prefix='v6e',
@@ -509,6 +652,7 @@ UserFacingNameToSystemCharacteristics = {
509
652
  machine_type='ct6e-standard-4t',
510
653
  supports_sub_slicing=True,
511
654
  supported_topologies=SUB_SLICING_TOPOLOGIES,
655
+ docker_platform=AMD_PLATFORM,
512
656
  ),
513
657
  **get_tpu_system_characteristics_map(
514
658
  prefix='v5p',
@@ -516,6 +660,7 @@ UserFacingNameToSystemCharacteristics = {
516
660
  gke_accelerator='tpu-v5p-slice',
517
661
  machine_type='ct5p-hightpu-4t',
518
662
  supports_sub_slicing=False,
663
+ docker_platform=AMD_PLATFORM,
519
664
  supported_topologies=generate_tpu_topologies(max_cubes=140),
520
665
  default_topologies=set([
521
666
  '2x2x1',
@@ -621,6 +766,7 @@ UserFacingNameToSystemCharacteristics = {
621
766
  tensorcores_per_chip=1,
622
767
  gke_accelerator='tpu-v5-lite-podslice',
623
768
  machine_type='ct5lp-hightpu-4t',
769
+ docker_platform=AMD_PLATFORM,
624
770
  supports_sub_slicing=False,
625
771
  supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
626
772
  ),
@@ -629,6 +775,7 @@ UserFacingNameToSystemCharacteristics = {
629
775
  tensorcores_per_chip=2,
630
776
  gke_accelerator='tpu-v4-podslice',
631
777
  machine_type='ct4p-hightpu-4t',
778
+ docker_platform=AMD_PLATFORM,
632
779
  supports_sub_slicing=False,
633
780
  supported_topologies=generate_tpu_topologies(
634
781
  max_cubes=64, enforce_nondecreasing=False
@@ -660,6 +807,7 @@ UserFacingNameToSystemCharacteristics = {
660
807
  accelerator_type=AcceleratorType.CPU,
661
808
  device_type='m1-megamem-96-1',
662
809
  supports_sub_slicing=False,
810
+ docker_platform=AMD_PLATFORM,
663
811
  ),
664
812
  # n2-standard-#vCPUs-#VMs
665
813
  'n2-standard-64-1': SystemCharacteristics(
@@ -671,6 +819,7 @@ UserFacingNameToSystemCharacteristics = {
671
819
  accelerator_type=AcceleratorType.CPU,
672
820
  device_type='n2-standard-64-1',
673
821
  supports_sub_slicing=False,
822
+ docker_platform=AMD_PLATFORM,
674
823
  ),
675
824
  'n2-standard-32-1': SystemCharacteristics(
676
825
  topology='N/A',
@@ -681,6 +830,7 @@ UserFacingNameToSystemCharacteristics = {
681
830
  accelerator_type=AcceleratorType.CPU,
682
831
  device_type='n2-standard-32-1',
683
832
  supports_sub_slicing=False,
833
+ docker_platform=AMD_PLATFORM,
684
834
  ),
685
835
  'n2-standard-32-2': SystemCharacteristics(
686
836
  topology='N/A',
@@ -691,6 +841,7 @@ UserFacingNameToSystemCharacteristics = {
691
841
  accelerator_type=AcceleratorType.CPU,
692
842
  device_type='n2-standard-32-2',
693
843
  supports_sub_slicing=False,
844
+ docker_platform=AMD_PLATFORM,
694
845
  ),
695
846
  'n2-standard-32-4': SystemCharacteristics(
696
847
  topology='N/A',
@@ -701,6 +852,7 @@ UserFacingNameToSystemCharacteristics = {
701
852
  accelerator_type=AcceleratorType.CPU,
702
853
  device_type='n2-standard-32-4',
703
854
  supports_sub_slicing=False,
855
+ docker_platform=AMD_PLATFORM,
704
856
  ),
705
857
  'n2-standard-32-8': SystemCharacteristics(
706
858
  topology='N/A',
@@ -711,6 +863,7 @@ UserFacingNameToSystemCharacteristics = {
711
863
  accelerator_type=AcceleratorType.CPU,
712
864
  device_type='n2-standard-32-8',
713
865
  supports_sub_slicing=False,
866
+ docker_platform=AMD_PLATFORM,
714
867
  ),
715
868
  'n2-standard-32-16': SystemCharacteristics(
716
869
  topology='N/A',
@@ -721,6 +874,7 @@ UserFacingNameToSystemCharacteristics = {
721
874
  accelerator_type=AcceleratorType.CPU,
722
875
  device_type='n2-standard-32-16',
723
876
  supports_sub_slicing=False,
877
+ docker_platform=AMD_PLATFORM,
724
878
  ),
725
879
  'n2-standard-32-32': SystemCharacteristics(
726
880
  topology='N/A',
@@ -731,6 +885,7 @@ UserFacingNameToSystemCharacteristics = {
731
885
  accelerator_type=AcceleratorType.CPU,
732
886
  device_type='n2-standard-32-32',
733
887
  supports_sub_slicing=False,
888
+ docker_platform=AMD_PLATFORM,
734
889
  ),
735
890
  'n2-standard-32-64': SystemCharacteristics(
736
891
  topology='N/A',
@@ -741,6 +896,7 @@ UserFacingNameToSystemCharacteristics = {
741
896
  accelerator_type=AcceleratorType.CPU,
742
897
  device_type='n2-standard-32-64',
743
898
  supports_sub_slicing=False,
899
+ docker_platform=AMD_PLATFORM,
744
900
  ),
745
901
  'n2-standard-32-128': SystemCharacteristics(
746
902
  topology='N/A',
@@ -751,6 +907,7 @@ UserFacingNameToSystemCharacteristics = {
751
907
  accelerator_type=AcceleratorType.CPU,
752
908
  device_type='n2-standard-32-128',
753
909
  supports_sub_slicing=False,
910
+ docker_platform=AMD_PLATFORM,
754
911
  ),
755
912
  'n2-standard-32-256': SystemCharacteristics(
756
913
  topology='N/A',
@@ -761,6 +918,7 @@ UserFacingNameToSystemCharacteristics = {
761
918
  accelerator_type=AcceleratorType.CPU,
762
919
  device_type='n2-standard-32-256',
763
920
  supports_sub_slicing=False,
921
+ docker_platform=AMD_PLATFORM,
764
922
  ),
765
923
  'n2-standard-32-512': SystemCharacteristics(
766
924
  topology='N/A',
@@ -771,6 +929,7 @@ UserFacingNameToSystemCharacteristics = {
771
929
  accelerator_type=AcceleratorType.CPU,
772
930
  device_type='n2-standard-32-512',
773
931
  supports_sub_slicing=False,
932
+ docker_platform=AMD_PLATFORM,
774
933
  ),
775
934
  'n2-standard-32-1024': SystemCharacteristics(
776
935
  topology='N/A',
@@ -781,6 +940,7 @@ UserFacingNameToSystemCharacteristics = {
781
940
  accelerator_type=AcceleratorType.CPU,
782
941
  device_type='n2-standard-32-1024',
783
942
  supports_sub_slicing=False,
943
+ docker_platform=AMD_PLATFORM,
784
944
  ),
785
945
  'n2-standard-32-2048': SystemCharacteristics(
786
946
  topology='N/A',
@@ -791,6 +951,7 @@ UserFacingNameToSystemCharacteristics = {
791
951
  accelerator_type=AcceleratorType.CPU,
792
952
  device_type='n2-standard-32-2048',
793
953
  supports_sub_slicing=False,
954
+ docker_platform=AMD_PLATFORM,
794
955
  ),
795
956
  }
796
957
  """ If you modify UserFacingNameToSystemCharacteristics you should also modify
@@ -808,3 +969,21 @@ def get_system_characteristics_keys_by_accelerator_type(
808
969
  for key, value in UserFacingNameToSystemCharacteristics.items()
809
970
  if value.accelerator_type in accelerators
810
971
  ]
972
+
973
+
974
def create_accelerator_label(system: SystemCharacteristics) -> str:
  """Builds the '<accelerator label>: <gke accelerator>' string for a system.

  Args:
    system: The system to describe.

  Returns:
    An empty string for CPU systems; otherwise the accelerator label looked up
    for the system's accelerator type, followed by ': ' and the system's
    `gke_accelerator`.
  """
  kind = system.accelerator_type
  if kind == AcceleratorType.CPU:
    return ''

  label = AcceleratorTypeToAcceleratorCharacteristics[kind].accelerator_label
  return f'{label}: {system.gke_accelerator}'
981
+
982
+
983
def create_machine_label(system: SystemCharacteristics) -> str:
  """Builds the '<machine label>: <topology>' string for a TPU system.

  Args:
    system: The system to describe.

  Returns:
    An empty string for anything that is not a TPU; otherwise the machine
    label looked up for the TPU accelerator type, followed by ': ' and the
    system's `topology`.
  """
  kind = system.accelerator_type
  if kind != AcceleratorType.TPU:
    return ''

  label = AcceleratorTypeToAcceleratorCharacteristics[kind].machine_label
  return f'{label}: {system.topology}'
@@ -14,7 +14,15 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from .system_characteristics import get_tpu_system_characteristics_map, generate_tpu_topologies, SystemCharacteristics, AcceleratorType
17
+ import pytest
18
+ from .system_characteristics import (
19
+ get_tpu_system_characteristics_map,
20
+ generate_tpu_topologies,
21
+ DockerPlatform,
22
+ SystemCharacteristics,
23
+ AcceleratorType,
24
+ GpuConfig,
25
+ )
18
26
 
19
27
 
20
28
  def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
@@ -25,6 +33,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
25
33
  machine_type="test",
26
34
  supported_topologies=["1x1"],
27
35
  supports_sub_slicing=False,
36
+ docker_platform=DockerPlatform.AMD,
28
37
  tpu_type_requires_workload_policy=False,
29
38
  )
30
39
 
@@ -37,6 +46,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
37
46
  accelerator_type=AcceleratorType.TPU,
38
47
  device_type="test-1",
39
48
  supports_sub_slicing=False,
49
+ docker_platform=DockerPlatform.AMD,
40
50
  requires_workload_policy=False,
41
51
  )
42
52
  assert result == {
@@ -53,6 +63,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
53
63
  machine_type="test",
54
64
  supported_topologies=["2x2"],
55
65
  supports_sub_slicing=False,
66
+ docker_platform=DockerPlatform.AMD,
56
67
  tpu_type_requires_workload_policy=True,
57
68
  )
58
69
 
@@ -65,6 +76,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
65
76
  accelerator_type=AcceleratorType.TPU,
66
77
  device_type="test-8",
67
78
  supports_sub_slicing=False,
79
+ docker_platform=DockerPlatform.AMD,
68
80
  requires_workload_policy=False,
69
81
  )
70
82
  assert result == {
@@ -81,6 +93,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
81
93
  machine_type="test",
82
94
  supported_topologies=["2x2x2"],
83
95
  supports_sub_slicing=False,
96
+ docker_platform=DockerPlatform.AMD,
84
97
  tpu_type_requires_workload_policy=True,
85
98
  )
86
99
 
@@ -93,6 +106,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
93
106
  accelerator_type=AcceleratorType.TPU,
94
107
  device_type="test-16",
95
108
  supports_sub_slicing=False,
109
+ docker_platform=DockerPlatform.AMD,
96
110
  requires_workload_policy=True,
97
111
  )
98
112
  assert result == {
@@ -109,6 +123,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
109
123
  machine_type="test",
110
124
  supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
111
125
  supports_sub_slicing=False,
126
+ docker_platform=DockerPlatform.AMD,
112
127
  default_topologies=set(["4x8x16"]),
113
128
  )
114
129
 
@@ -146,3 +161,36 @@ def test_generate_tpu_topologies_contains_sub_cube_slices():
146
161
  one_cube = generate_tpu_topologies(max_cubes=1)
147
162
 
148
163
  assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]
164
+
165
+
166
def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
  """A GPU system built with a gpu_config ends up requiring a workload policy."""
  l4_kwargs = dict(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-l4",
      gce_machine_type="g2-standard-12",
      chips_per_vm=1,
      accelerator_type=AcceleratorType.GPU,
      device_type="l4-1",
      supports_sub_slicing=False,
      docker_platform=DockerPlatform.AMD,
      gpu_config=GpuConfig(requires_topology=False),
  )

  system = SystemCharacteristics(**l4_kwargs)

  assert system.requires_workload_policy is True
181
+
182
+
183
def test_system_characteristics_post_init_throws_for_gpu_without_config():
  """Building a GPU system without a gpu_config must raise ValueError."""
  kwargs_without_gpu_config = dict(
      topology="N/A",
      vms_per_slice=1,
      gke_accelerator="nvidia-l4",
      gce_machine_type="g2-standard-12",
      chips_per_vm=1,
      accelerator_type=AcceleratorType.GPU,
      device_type="l4-1",
      supports_sub_slicing=False,
      docker_platform=DockerPlatform.AMD,
  )

  with pytest.raises(ValueError, match="'gpu_config' was not provided"):
    SystemCharacteristics(**kwargs_without_gpu_config)
xpk/core/telemetry.py CHANGED
@@ -27,7 +27,7 @@ import requests
27
27
  from enum import Enum
28
28
  from typing import Any
29
29
  from dataclasses import dataclass
30
- from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
30
+ from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
31
31
  from ..utils.execution_context import is_dry_run
32
32
  from ..utils.user_agent import get_user_agent
33
33
  from ..utils.feature_flags import FeatureFlags
@@ -36,7 +36,7 @@ from ..utils.feature_flags import FeatureFlags
36
36
  def should_send_telemetry():
37
37
  return (
38
38
  FeatureFlags.TELEMETRY_ENABLED
39
- and xpk_config.get(SEND_TELEMETRY_KEY) != "false"
39
+ and get_config().get(SEND_TELEMETRY_KEY) != "false"
40
40
  )
41
41
 
42
42
 
@@ -254,10 +254,10 @@ def _get_session_id() -> str:
254
254
 
255
255
  def _ensure_client_id() -> str:
256
256
  """Generates Client ID and stores in configuration if not already present."""
257
- current_client_id = xpk_config.get(CLIENT_ID_KEY)
257
+ current_client_id = get_config().get(CLIENT_ID_KEY)
258
258
  if current_client_id is not None:
259
259
  return current_client_id
260
260
 
261
261
  new_client_id = str(uuid.uuid4())
262
- xpk_config.set(CLIENT_ID_KEY, new_client_id)
262
+ get_config().set(CLIENT_ID_KEY, new_client_id)
263
263
  return new_client_id
@@ -16,7 +16,7 @@ limitations under the License.
16
16
 
17
17
  import pytest
18
18
  import json
19
- from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
19
+ from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY
20
20
  from .telemetry import MetricsCollector, MetricsEventMetadataKey, should_send_telemetry
21
21
  from ..utils.execution_context import set_dry_run
22
22
  from ..utils.feature_flags import FeatureFlags
@@ -31,9 +31,9 @@ def setup_mocks(mocker: MockerFixture):
31
31
  mocker.patch('os.path.basename', return_value='xpk.py')
32
32
  mocker.patch('os.path.abspath', return_value='/home/xpk_user')
33
33
  set_dry_run(False)
34
- xpk_config.set(CLIENT_ID_KEY, 'client_id')
34
+ get_config().set(CLIENT_ID_KEY, 'client_id')
35
35
  yield
36
- xpk_config.set(CLIENT_ID_KEY, None)
36
+ get_config().set(CLIENT_ID_KEY, None)
37
37
 
38
38
 
39
39
  @pytest.mark.parametrize(
@@ -48,13 +48,13 @@ def setup_mocks(mocker: MockerFixture):
48
48
  def test_should_send_telemetry_returns_correct_value(
49
49
  feature_flag: bool, config_value: str, expected: bool
50
50
  ):
51
- xpk_config.set(SEND_TELEMETRY_KEY, config_value)
51
+ get_config().set(SEND_TELEMETRY_KEY, config_value)
52
52
  FeatureFlags.TELEMETRY_ENABLED = feature_flag
53
53
  assert should_send_telemetry() is expected
54
54
 
55
55
 
56
56
  def test_metrics_collector_generates_client_id_if_not_present():
57
- xpk_config.set(CLIENT_ID_KEY, None)
57
+ get_config().set(CLIENT_ID_KEY, None)
58
58
  MetricsCollector.log_start(command='test')
59
59
  payload = json.loads(MetricsCollector.flush())
60
60
  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
@@ -79,12 +79,12 @@ def test_metrics_collector_logs_start_event_correctly():
79
79
  ],
80
80
  'event_name': 'start',
81
81
  'event_type': 'commands',
82
- 'release_version': 'v0.15.0',
82
+ 'release_version': 'v0.0.0',
83
83
  }
84
84
 
85
85
 
86
86
  def test_metrics_collector_generates_client_id_when_not_present():
87
- xpk_config.set(CLIENT_ID_KEY, None)
87
+ get_config().set(CLIENT_ID_KEY, None)
88
88
  MetricsCollector.log_start(command='test')
89
89
  payload = json.loads(MetricsCollector.flush())
90
90
  extension_json = json.loads(payload['log_event'][0]['source_extension_json'])
@@ -109,7 +109,7 @@ def test_metrics_collector_logs_complete_event_correctly():
109
109
  ],
110
110
  'event_name': 'complete',
111
111
  'event_type': 'commands',
112
- 'release_version': 'v0.15.0',
112
+ 'release_version': 'v0.0.0',
113
113
  }
114
114
 
115
115
 
@@ -132,7 +132,7 @@ def test_metrics_collector_logs_custom_event_correctly():
132
132
  ],
133
133
  'event_name': 'test',
134
134
  'event_type': 'custom',
135
- 'release_version': 'v0.15.0',
135
+ 'release_version': 'v0.0.0',
136
136
  }
137
137
 
138
138
 
xpk/core/vertex.py CHANGED
@@ -15,7 +15,7 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  from ..utils.console import xpk_print
18
- from .resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
18
+ from .resources import ConfigMapType, get_cluster_configmap
19
19
 
20
20
  DEFAULT_VERTEX_TENSORBOARD_NAME = 'tb-instance'
21
21
 
@@ -65,8 +65,9 @@ def create_vertex_experiment(args) -> dict | None:
65
65
  tensorboard,
66
66
  )
67
67
 
68
- metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
69
- cluster_config_map = get_cluster_configmap(metadata_configmap_name)
68
+ cluster_config_map = get_cluster_configmap(
69
+ args.cluster, ConfigMapType.METADATA
70
+ )
70
71
 
71
72
  if cluster_config_map is None or 'tensorboard_name' not in cluster_config_map:
72
73
  xpk_print(