xpk 0.17.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +4 -35
- xpk/commands/cluster_gcluster.py +1 -13
- xpk/commands/cluster_gcluster_test.py +2 -10
- xpk/commands/cluster_test.py +0 -4
- xpk/commands/workload.py +10 -3
- xpk/commands/workload_test.py +1 -0
- xpk/core/cluster.py +10 -9
- xpk/core/config.py +5 -17
- xpk/core/kueue_manager_test.py +2 -0
- xpk/core/nodepool.py +6 -0
- xpk/core/nodepool_test.py +4 -0
- xpk/core/scheduling.py +28 -3
- xpk/core/scheduling_test.py +38 -1
- xpk/core/system_characteristics.py +39 -16
- xpk/core/system_characteristics_test.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +0 -15
- xpk/core/workload_decorators/tcpx_decorator.py +0 -8
- xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
- xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
- xpk/parser/common.py +0 -17
- xpk/parser/core.py +0 -39
- xpk/parser/storage.py +0 -11
- xpk/utils/feature_flags.py +1 -1
- xpk/utils/validation.py +0 -8
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/METADATA +15 -4
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/RECORD +30 -41
- xpk/commands/batch.py +0 -144
- xpk/commands/job.py +0 -244
- xpk/commands/kind.py +0 -286
- xpk/commands/kjob_common.py +0 -60
- xpk/commands/run.py +0 -140
- xpk/commands/shell.py +0 -142
- xpk/parser/batch.py +0 -43
- xpk/parser/job.py +0 -147
- xpk/parser/kind.py +0 -95
- xpk/parser/run.py +0 -47
- xpk/parser/shell.py +0 -59
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/WHEEL +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -80,15 +80,6 @@ class GpuConfig:
|
|
|
80
80
|
|
|
81
81
|
requires_topology: bool
|
|
82
82
|
gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
|
|
83
|
-
kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
|
|
84
|
-
"""A function to decorate the kjob template for GPU-specific configurations.
|
|
85
|
-
|
|
86
|
-
Args:
|
|
87
|
-
job_manifest (dict): The kjob manifest as a dictionary.
|
|
88
|
-
|
|
89
|
-
Returns:
|
|
90
|
-
dict: The modified kjob manifest as a dictionary.
|
|
91
|
-
"""
|
|
92
83
|
nccl_installer: Optional[str] = None
|
|
93
84
|
jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None
|
|
94
85
|
"""A function to decorate the jobset for GPU-specific configurations.
|
|
@@ -106,7 +97,7 @@ class GpuConfig:
|
|
|
106
97
|
parts = []
|
|
107
98
|
for f in dataclasses.fields(self):
|
|
108
99
|
value = getattr(self, f.name)
|
|
109
|
-
if f.name in ('kjob_decorator_fn', 'jobset_decorator_fn') and value:
|
|
100
|
+
if f.name in ('jobset_decorator_fn') and value:
|
|
110
101
|
parts.append(f'{f.name}=<function {value.__name__}>')
|
|
111
102
|
else:
|
|
112
103
|
parts.append(f'{f.name}={repr(value)}')
|
|
@@ -151,6 +142,7 @@ class SystemCharacteristics:
|
|
|
151
142
|
device_type: str
|
|
152
143
|
supports_sub_slicing: bool
|
|
153
144
|
supports_super_slicing: bool
|
|
145
|
+
supports_accelerator_network_profile: bool
|
|
154
146
|
docker_platform: DockerPlatform
|
|
155
147
|
requires_workload_policy: bool = False
|
|
156
148
|
gpu_config: Optional[GpuConfig] = None
|
|
@@ -242,6 +234,7 @@ def get_tpu_system_characteristics_map(
|
|
|
242
234
|
machine_type: str,
|
|
243
235
|
supported_topologies: list[str],
|
|
244
236
|
docker_platform: DockerPlatform,
|
|
237
|
+
supports_accelerator_network_profile: bool,
|
|
245
238
|
tpu_type_requires_workload_policy: bool = False,
|
|
246
239
|
default_topologies: set[str] | None = None,
|
|
247
240
|
sub_slicing_topologies: set[str] | None = None,
|
|
@@ -268,6 +261,7 @@ def get_tpu_system_characteristics_map(
|
|
|
268
261
|
and vms_per_slice > 1,
|
|
269
262
|
supports_sub_slicing=topology in sub_slicing_topologies,
|
|
270
263
|
supports_super_slicing=topology in super_slicing_topologies,
|
|
264
|
+
supports_accelerator_network_profile=supports_accelerator_network_profile,
|
|
271
265
|
docker_platform=docker_platform,
|
|
272
266
|
)
|
|
273
267
|
system_characteristics_map[f'{prefix}-{topology}'] = system
|
|
@@ -312,6 +306,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
312
306
|
device_type='l4-1',
|
|
313
307
|
supports_sub_slicing=False,
|
|
314
308
|
supports_super_slicing=False,
|
|
309
|
+
supports_accelerator_network_profile=False,
|
|
315
310
|
gpu_config=GpuConfig(requires_topology=False),
|
|
316
311
|
docker_platform=AMD_PLATFORM,
|
|
317
312
|
),
|
|
@@ -325,6 +320,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
325
320
|
device_type='l4-2',
|
|
326
321
|
supports_sub_slicing=False,
|
|
327
322
|
supports_super_slicing=False,
|
|
323
|
+
supports_accelerator_network_profile=False,
|
|
328
324
|
gpu_config=GpuConfig(requires_topology=False),
|
|
329
325
|
docker_platform=AMD_PLATFORM,
|
|
330
326
|
),
|
|
@@ -338,6 +334,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
338
334
|
device_type='l4-4',
|
|
339
335
|
supports_sub_slicing=False,
|
|
340
336
|
supports_super_slicing=False,
|
|
337
|
+
supports_accelerator_network_profile=False,
|
|
341
338
|
gpu_config=GpuConfig(requires_topology=False),
|
|
342
339
|
docker_platform=AMD_PLATFORM,
|
|
343
340
|
),
|
|
@@ -351,6 +348,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
351
348
|
device_type='l4-8',
|
|
352
349
|
supports_sub_slicing=False,
|
|
353
350
|
supports_super_slicing=False,
|
|
351
|
+
supports_accelerator_network_profile=False,
|
|
354
352
|
gpu_config=GpuConfig(requires_topology=False),
|
|
355
353
|
docker_platform=AMD_PLATFORM,
|
|
356
354
|
),
|
|
@@ -365,6 +363,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
365
363
|
device_type='a100-40gb-1',
|
|
366
364
|
supports_sub_slicing=False,
|
|
367
365
|
supports_super_slicing=False,
|
|
366
|
+
supports_accelerator_network_profile=False,
|
|
368
367
|
gpu_config=GpuConfig(requires_topology=False),
|
|
369
368
|
docker_platform=AMD_PLATFORM,
|
|
370
369
|
),
|
|
@@ -378,6 +377,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
378
377
|
device_type='a100-40gb-2',
|
|
379
378
|
supports_sub_slicing=False,
|
|
380
379
|
supports_super_slicing=False,
|
|
380
|
+
supports_accelerator_network_profile=False,
|
|
381
381
|
gpu_config=GpuConfig(requires_topology=False),
|
|
382
382
|
docker_platform=AMD_PLATFORM,
|
|
383
383
|
),
|
|
@@ -391,6 +391,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
391
391
|
device_type='a100-40gb-4',
|
|
392
392
|
supports_sub_slicing=False,
|
|
393
393
|
supports_super_slicing=False,
|
|
394
|
+
supports_accelerator_network_profile=False,
|
|
394
395
|
gpu_config=GpuConfig(requires_topology=False),
|
|
395
396
|
docker_platform=AMD_PLATFORM,
|
|
396
397
|
),
|
|
@@ -404,6 +405,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
404
405
|
device_type='a100-40gb-8',
|
|
405
406
|
supports_sub_slicing=False,
|
|
406
407
|
supports_super_slicing=False,
|
|
408
|
+
supports_accelerator_network_profile=False,
|
|
407
409
|
gpu_config=GpuConfig(requires_topology=False),
|
|
408
410
|
docker_platform=AMD_PLATFORM,
|
|
409
411
|
),
|
|
@@ -417,10 +419,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
417
419
|
device_type='gb200-4',
|
|
418
420
|
supports_sub_slicing=False,
|
|
419
421
|
supports_super_slicing=False,
|
|
422
|
+
supports_accelerator_network_profile=True,
|
|
420
423
|
gpu_config=GpuConfig(
|
|
421
424
|
requires_topology=True,
|
|
422
425
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
423
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
424
426
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
425
427
|
gpu_direct_name='rdma',
|
|
426
428
|
),
|
|
@@ -436,10 +438,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
436
438
|
device_type='gb200-4',
|
|
437
439
|
supports_sub_slicing=False,
|
|
438
440
|
supports_super_slicing=False,
|
|
441
|
+
supports_accelerator_network_profile=True,
|
|
439
442
|
gpu_config=GpuConfig(
|
|
440
443
|
requires_topology=True,
|
|
441
444
|
nccl_installer=INSTALLER_NCCL_RDMA_A4X,
|
|
442
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
443
445
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
444
446
|
gpu_direct_name='rdma',
|
|
445
447
|
),
|
|
@@ -455,10 +457,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
455
457
|
device_type='b200-8',
|
|
456
458
|
supports_sub_slicing=False,
|
|
457
459
|
supports_super_slicing=False,
|
|
460
|
+
supports_accelerator_network_profile=True,
|
|
458
461
|
gpu_config=GpuConfig(
|
|
459
462
|
requires_topology=True,
|
|
460
463
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
461
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
462
464
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
463
465
|
gpu_direct_name='rdma',
|
|
464
466
|
),
|
|
@@ -474,10 +476,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
474
476
|
device_type='h200-141gb-8',
|
|
475
477
|
supports_sub_slicing=False,
|
|
476
478
|
supports_super_slicing=False,
|
|
479
|
+
supports_accelerator_network_profile=True,
|
|
477
480
|
gpu_config=GpuConfig(
|
|
478
481
|
requires_topology=True,
|
|
479
482
|
nccl_installer=INSTALLER_NCCL_RDMA,
|
|
480
|
-
kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
|
|
481
483
|
jobset_decorator_fn=rdma_decorator.decorate_jobset,
|
|
482
484
|
gpu_direct_name='rdma',
|
|
483
485
|
),
|
|
@@ -494,10 +496,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
494
496
|
device_type='h100-80gb-8',
|
|
495
497
|
supports_sub_slicing=False,
|
|
496
498
|
supports_super_slicing=False,
|
|
499
|
+
supports_accelerator_network_profile=True,
|
|
497
500
|
gpu_config=GpuConfig(
|
|
498
501
|
requires_topology=True,
|
|
499
502
|
nccl_installer=INSTALLER_NCCL_TCPX,
|
|
500
|
-
kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
|
|
501
503
|
jobset_decorator_fn=tcpx_decorator.decorate_jobset,
|
|
502
504
|
gpu_direct_name='tcpx',
|
|
503
505
|
),
|
|
@@ -514,10 +516,10 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
514
516
|
device_type='h100-mega-80gb-8',
|
|
515
517
|
supports_sub_slicing=False,
|
|
516
518
|
supports_super_slicing=False,
|
|
519
|
+
supports_accelerator_network_profile=True,
|
|
517
520
|
gpu_config=GpuConfig(
|
|
518
521
|
requires_topology=True,
|
|
519
522
|
nccl_installer=INSTALLER_NCCL_TCPXO,
|
|
520
|
-
kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
|
|
521
523
|
jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
|
|
522
524
|
gpu_direct_name='tcpxo',
|
|
523
525
|
),
|
|
@@ -531,6 +533,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
531
533
|
machine_type='tpu7x-standard-1t',
|
|
532
534
|
supported_topologies=['1x1x1'],
|
|
533
535
|
tpu_type_requires_workload_policy=True,
|
|
536
|
+
supports_accelerator_network_profile=False,
|
|
534
537
|
docker_platform=AMD_PLATFORM,
|
|
535
538
|
),
|
|
536
539
|
**get_tpu_system_characteristics_map(
|
|
@@ -539,6 +542,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
539
542
|
gke_accelerator='tpu7x',
|
|
540
543
|
machine_type='tpu7x-standard-4t',
|
|
541
544
|
tpu_type_requires_workload_policy=True,
|
|
545
|
+
supports_accelerator_network_profile=False,
|
|
542
546
|
docker_platform=AMD_PLATFORM,
|
|
543
547
|
supported_topologies=generate_tpu_topologies(max_cubes=144),
|
|
544
548
|
super_slicing_topologies=set(['4x4x4']),
|
|
@@ -650,6 +654,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
650
654
|
machine_type='ct6e-standard-1t',
|
|
651
655
|
supported_topologies=['1x1'],
|
|
652
656
|
docker_platform=AMD_PLATFORM,
|
|
657
|
+
supports_accelerator_network_profile=True,
|
|
653
658
|
),
|
|
654
659
|
**get_tpu_system_characteristics_map(
|
|
655
660
|
prefix='v6e',
|
|
@@ -659,6 +664,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
659
664
|
supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
|
|
660
665
|
sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
|
|
661
666
|
docker_platform=AMD_PLATFORM,
|
|
667
|
+
supports_accelerator_network_profile=True,
|
|
662
668
|
),
|
|
663
669
|
**get_tpu_system_characteristics_map(
|
|
664
670
|
prefix='v5p',
|
|
@@ -667,6 +673,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
667
673
|
machine_type='ct5p-hightpu-4t',
|
|
668
674
|
docker_platform=AMD_PLATFORM,
|
|
669
675
|
supported_topologies=generate_tpu_topologies(max_cubes=140),
|
|
676
|
+
supports_accelerator_network_profile=False,
|
|
670
677
|
default_topologies=set([
|
|
671
678
|
'2x2x1',
|
|
672
679
|
'2x2x2',
|
|
@@ -773,6 +780,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
773
780
|
machine_type='ct5lp-hightpu-4t',
|
|
774
781
|
docker_platform=AMD_PLATFORM,
|
|
775
782
|
supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
|
|
783
|
+
supports_accelerator_network_profile=False,
|
|
776
784
|
),
|
|
777
785
|
**get_tpu_system_characteristics_map(
|
|
778
786
|
prefix='v4',
|
|
@@ -783,6 +791,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
783
791
|
supported_topologies=generate_tpu_topologies(
|
|
784
792
|
max_cubes=64, enforce_nondecreasing=False
|
|
785
793
|
),
|
|
794
|
+
supports_accelerator_network_profile=False,
|
|
786
795
|
default_topologies=set([
|
|
787
796
|
'2x2x1',
|
|
788
797
|
'2x2x2',
|
|
@@ -811,6 +820,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
811
820
|
device_type='m1-megamem-96-1',
|
|
812
821
|
supports_sub_slicing=False,
|
|
813
822
|
supports_super_slicing=False,
|
|
823
|
+
supports_accelerator_network_profile=False,
|
|
814
824
|
docker_platform=AMD_PLATFORM,
|
|
815
825
|
),
|
|
816
826
|
# n2-standard-#vCPUs-#VMs
|
|
@@ -824,6 +834,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
824
834
|
device_type='n2-standard-64-1',
|
|
825
835
|
supports_sub_slicing=False,
|
|
826
836
|
supports_super_slicing=False,
|
|
837
|
+
supports_accelerator_network_profile=False,
|
|
827
838
|
docker_platform=AMD_PLATFORM,
|
|
828
839
|
),
|
|
829
840
|
'n2-standard-32-1': SystemCharacteristics(
|
|
@@ -836,6 +847,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
836
847
|
device_type='n2-standard-32-1',
|
|
837
848
|
supports_sub_slicing=False,
|
|
838
849
|
supports_super_slicing=False,
|
|
850
|
+
supports_accelerator_network_profile=False,
|
|
839
851
|
docker_platform=AMD_PLATFORM,
|
|
840
852
|
),
|
|
841
853
|
'n2-standard-32-2': SystemCharacteristics(
|
|
@@ -848,6 +860,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
848
860
|
device_type='n2-standard-32-2',
|
|
849
861
|
supports_sub_slicing=False,
|
|
850
862
|
supports_super_slicing=False,
|
|
863
|
+
supports_accelerator_network_profile=False,
|
|
851
864
|
docker_platform=AMD_PLATFORM,
|
|
852
865
|
),
|
|
853
866
|
'n2-standard-32-4': SystemCharacteristics(
|
|
@@ -860,6 +873,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
860
873
|
device_type='n2-standard-32-4',
|
|
861
874
|
supports_sub_slicing=False,
|
|
862
875
|
supports_super_slicing=False,
|
|
876
|
+
supports_accelerator_network_profile=False,
|
|
863
877
|
docker_platform=AMD_PLATFORM,
|
|
864
878
|
),
|
|
865
879
|
'n2-standard-32-8': SystemCharacteristics(
|
|
@@ -872,6 +886,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
872
886
|
device_type='n2-standard-32-8',
|
|
873
887
|
supports_sub_slicing=False,
|
|
874
888
|
supports_super_slicing=False,
|
|
889
|
+
supports_accelerator_network_profile=False,
|
|
875
890
|
docker_platform=AMD_PLATFORM,
|
|
876
891
|
),
|
|
877
892
|
'n2-standard-32-16': SystemCharacteristics(
|
|
@@ -884,6 +899,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
884
899
|
device_type='n2-standard-32-16',
|
|
885
900
|
supports_sub_slicing=False,
|
|
886
901
|
supports_super_slicing=False,
|
|
902
|
+
supports_accelerator_network_profile=False,
|
|
887
903
|
docker_platform=AMD_PLATFORM,
|
|
888
904
|
),
|
|
889
905
|
'n2-standard-32-32': SystemCharacteristics(
|
|
@@ -896,6 +912,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
896
912
|
device_type='n2-standard-32-32',
|
|
897
913
|
supports_sub_slicing=False,
|
|
898
914
|
supports_super_slicing=False,
|
|
915
|
+
supports_accelerator_network_profile=False,
|
|
899
916
|
docker_platform=AMD_PLATFORM,
|
|
900
917
|
),
|
|
901
918
|
'n2-standard-32-64': SystemCharacteristics(
|
|
@@ -908,6 +925,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
908
925
|
device_type='n2-standard-32-64',
|
|
909
926
|
supports_sub_slicing=False,
|
|
910
927
|
supports_super_slicing=False,
|
|
928
|
+
supports_accelerator_network_profile=False,
|
|
911
929
|
docker_platform=AMD_PLATFORM,
|
|
912
930
|
),
|
|
913
931
|
'n2-standard-32-128': SystemCharacteristics(
|
|
@@ -920,6 +938,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
920
938
|
device_type='n2-standard-32-128',
|
|
921
939
|
supports_sub_slicing=False,
|
|
922
940
|
supports_super_slicing=False,
|
|
941
|
+
supports_accelerator_network_profile=False,
|
|
923
942
|
docker_platform=AMD_PLATFORM,
|
|
924
943
|
),
|
|
925
944
|
'n2-standard-32-256': SystemCharacteristics(
|
|
@@ -932,6 +951,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
932
951
|
device_type='n2-standard-32-256',
|
|
933
952
|
supports_sub_slicing=False,
|
|
934
953
|
supports_super_slicing=False,
|
|
954
|
+
supports_accelerator_network_profile=False,
|
|
935
955
|
docker_platform=AMD_PLATFORM,
|
|
936
956
|
),
|
|
937
957
|
'n2-standard-32-512': SystemCharacteristics(
|
|
@@ -944,6 +964,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
944
964
|
device_type='n2-standard-32-512',
|
|
945
965
|
supports_sub_slicing=False,
|
|
946
966
|
supports_super_slicing=False,
|
|
967
|
+
supports_accelerator_network_profile=False,
|
|
947
968
|
docker_platform=AMD_PLATFORM,
|
|
948
969
|
),
|
|
949
970
|
'n2-standard-32-1024': SystemCharacteristics(
|
|
@@ -956,6 +977,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
956
977
|
device_type='n2-standard-32-1024',
|
|
957
978
|
supports_sub_slicing=False,
|
|
958
979
|
supports_super_slicing=False,
|
|
980
|
+
supports_accelerator_network_profile=False,
|
|
959
981
|
docker_platform=AMD_PLATFORM,
|
|
960
982
|
),
|
|
961
983
|
'n2-standard-32-2048': SystemCharacteristics(
|
|
@@ -968,6 +990,7 @@ UserFacingNameToSystemCharacteristics = {
|
|
|
968
990
|
device_type='n2-standard-32-2048',
|
|
969
991
|
supports_sub_slicing=False,
|
|
970
992
|
supports_super_slicing=False,
|
|
993
|
+
supports_accelerator_network_profile=False,
|
|
971
994
|
docker_platform=AMD_PLATFORM,
|
|
972
995
|
),
|
|
973
996
|
}
|
|
@@ -34,6 +34,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
34
34
|
supported_topologies=["1x1"],
|
|
35
35
|
docker_platform=DockerPlatform.AMD,
|
|
36
36
|
tpu_type_requires_workload_policy=False,
|
|
37
|
+
supports_accelerator_network_profile=False,
|
|
37
38
|
)
|
|
38
39
|
|
|
39
40
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -48,6 +49,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
|
|
|
48
49
|
supports_super_slicing=False,
|
|
49
50
|
docker_platform=DockerPlatform.AMD,
|
|
50
51
|
requires_workload_policy=False,
|
|
52
|
+
supports_accelerator_network_profile=False,
|
|
51
53
|
)
|
|
52
54
|
assert result == {
|
|
53
55
|
"test-1": expected_system_characteristics,
|
|
@@ -62,6 +64,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
62
64
|
gke_accelerator="test",
|
|
63
65
|
machine_type="test",
|
|
64
66
|
supported_topologies=["2x2"],
|
|
67
|
+
supports_accelerator_network_profile=False,
|
|
65
68
|
docker_platform=DockerPlatform.AMD,
|
|
66
69
|
tpu_type_requires_workload_policy=True,
|
|
67
70
|
)
|
|
@@ -76,6 +79,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
|
|
|
76
79
|
device_type="test-8",
|
|
77
80
|
supports_sub_slicing=False,
|
|
78
81
|
supports_super_slicing=False,
|
|
82
|
+
supports_accelerator_network_profile=False,
|
|
79
83
|
docker_platform=DockerPlatform.AMD,
|
|
80
84
|
requires_workload_policy=False,
|
|
81
85
|
)
|
|
@@ -94,6 +98,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
94
98
|
supported_topologies=["2x2x2"],
|
|
95
99
|
docker_platform=DockerPlatform.AMD,
|
|
96
100
|
tpu_type_requires_workload_policy=True,
|
|
101
|
+
supports_accelerator_network_profile=False,
|
|
97
102
|
)
|
|
98
103
|
|
|
99
104
|
expected_system_characteristics = SystemCharacteristics(
|
|
@@ -106,6 +111,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
|
|
|
106
111
|
device_type="test-16",
|
|
107
112
|
supports_sub_slicing=False,
|
|
108
113
|
supports_super_slicing=False,
|
|
114
|
+
supports_accelerator_network_profile=False,
|
|
109
115
|
docker_platform=DockerPlatform.AMD,
|
|
110
116
|
requires_workload_policy=True,
|
|
111
117
|
)
|
|
@@ -122,6 +128,7 @@ def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
|
|
|
122
128
|
gke_accelerator="test",
|
|
123
129
|
machine_type="test",
|
|
124
130
|
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
131
|
+
supports_accelerator_network_profile=False,
|
|
125
132
|
docker_platform=DockerPlatform.AMD,
|
|
126
133
|
sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
127
134
|
)
|
|
@@ -138,6 +145,7 @@ def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
|
|
|
138
145
|
gke_accelerator="test",
|
|
139
146
|
machine_type="test",
|
|
140
147
|
supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
|
|
148
|
+
supports_accelerator_network_profile=False,
|
|
141
149
|
docker_platform=DockerPlatform.AMD,
|
|
142
150
|
super_slicing_topologies=set(["4x4x8", "4x4x16"]),
|
|
143
151
|
)
|
|
@@ -154,6 +162,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
|
|
|
154
162
|
gke_accelerator="test",
|
|
155
163
|
machine_type="test",
|
|
156
164
|
supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
|
|
165
|
+
supports_accelerator_network_profile=False,
|
|
157
166
|
docker_platform=DockerPlatform.AMD,
|
|
158
167
|
default_topologies=set(["4x8x16"]),
|
|
159
168
|
)
|
|
@@ -206,6 +215,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
|
|
|
206
215
|
device_type="l4-1",
|
|
207
216
|
supports_sub_slicing=False,
|
|
208
217
|
supports_super_slicing=False,
|
|
218
|
+
supports_accelerator_network_profile=False,
|
|
209
219
|
docker_platform=DockerPlatform.AMD,
|
|
210
220
|
gpu_config=GpuConfig(requires_topology=False),
|
|
211
221
|
)
|
|
@@ -225,5 +235,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
|
|
|
225
235
|
device_type="l4-1",
|
|
226
236
|
supports_sub_slicing=False,
|
|
227
237
|
supports_super_slicing=False,
|
|
238
|
+
supports_accelerator_network_profile=False,
|
|
228
239
|
docker_platform=DockerPlatform.AMD,
|
|
229
240
|
)
|
|
@@ -18,21 +18,6 @@ import yaml
|
|
|
18
18
|
from ...utils.yaml import literal_string
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
22
|
-
spec = (
|
|
23
|
-
job_manifest.setdefault('spec', {})
|
|
24
|
-
.setdefault('template', {})
|
|
25
|
-
.setdefault('spec', {})
|
|
26
|
-
)
|
|
27
|
-
spec.setdefault('tolerations', [])
|
|
28
|
-
spec.setdefault('volumes', [])
|
|
29
|
-
|
|
30
|
-
add_volumes(job_manifest)
|
|
31
|
-
add_tolerations(job_manifest)
|
|
32
|
-
update_gpu_containers(job_manifest)
|
|
33
|
-
return job_manifest
|
|
34
|
-
|
|
35
|
-
|
|
36
21
|
def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
|
|
37
22
|
"""
|
|
38
23
|
Decorates a JobSet manifest with the necessary components for rdma-daemon.
|
|
@@ -22,14 +22,6 @@ from ...utils.yaml import literal_string
|
|
|
22
22
|
tcpx = 'v2.0.11'
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
26
|
-
add_volumes(job_manifest)
|
|
27
|
-
add_tolerations(job_manifest)
|
|
28
|
-
add_tcpx_daemon_container(job_manifest)
|
|
29
|
-
update_gpu_containers(job_manifest)
|
|
30
|
-
return job_manifest
|
|
31
|
-
|
|
32
|
-
|
|
33
25
|
def decorate_job(job_manifest: dict) -> dict:
|
|
34
26
|
add_annotations(job_manifest)
|
|
35
27
|
add_volumes(job_manifest)
|
|
@@ -47,24 +47,6 @@ spec:
|
|
|
47
47
|
image: my-sidecar-image
|
|
48
48
|
"""
|
|
49
49
|
|
|
50
|
-
# Minimal kjob template for testing
|
|
51
|
-
BASE_KJOB_TEMPLATE = {
|
|
52
|
-
"spec": {
|
|
53
|
-
"template": {
|
|
54
|
-
"spec": {
|
|
55
|
-
"containers": [
|
|
56
|
-
{
|
|
57
|
-
"name": "main-gpu-container",
|
|
58
|
-
"image": "my-gpu-image",
|
|
59
|
-
"resources": {"limits": {"nvidia.com/gpu": 8}},
|
|
60
|
-
},
|
|
61
|
-
{"name": "sidecar-container", "image": "my-sidecar-image"},
|
|
62
|
-
]
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
|
|
68
50
|
# Minimal job manifest for testing
|
|
69
51
|
BASE_JOB_MANIFEST = {
|
|
70
52
|
"spec": {
|
|
@@ -205,63 +187,3 @@ def test_decorate_job():
|
|
|
205
187
|
assert "devices.gke.io/container.tcpx-daemon" in annotations
|
|
206
188
|
assert "networking.gke.io/default-interface" in annotations
|
|
207
189
|
assert "networking.gke.io/interfaces" in annotations
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
def test_decorate_kjob_template():
|
|
211
|
-
"""Tests decorate_kjob_template."""
|
|
212
|
-
kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
|
|
213
|
-
|
|
214
|
-
decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
|
|
215
|
-
|
|
216
|
-
pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
|
|
217
|
-
|
|
218
|
-
# Check annotations are NOT added
|
|
219
|
-
assert "annotations" not in decorated_manifest["spec"]["template"].get(
|
|
220
|
-
"metadata", {}
|
|
221
|
-
)
|
|
222
|
-
|
|
223
|
-
# Check tolerations
|
|
224
|
-
tolerations = pod_template_spec["tolerations"]
|
|
225
|
-
assert {
|
|
226
|
-
"key": "user-workload",
|
|
227
|
-
"operator": "Equal",
|
|
228
|
-
"value": "true",
|
|
229
|
-
"effect": "NoSchedule",
|
|
230
|
-
} in tolerations
|
|
231
|
-
|
|
232
|
-
# Check volumes
|
|
233
|
-
volumes = pod_template_spec["volumes"]
|
|
234
|
-
volume_names = {v["name"] for v in volumes}
|
|
235
|
-
assert "libraries" in volume_names
|
|
236
|
-
assert "sys" in volume_names
|
|
237
|
-
assert "proc-sys" in volume_names
|
|
238
|
-
assert "tcpx-socket" in volume_names
|
|
239
|
-
assert "dshm" in volume_names
|
|
240
|
-
|
|
241
|
-
# Check init container
|
|
242
|
-
init_containers = pod_template_spec["initContainers"]
|
|
243
|
-
assert len(init_containers) == 1
|
|
244
|
-
tcpx_daemon = init_containers[0]
|
|
245
|
-
assert tcpx_daemon["name"] == "tcpx-daemon"
|
|
246
|
-
assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
|
|
247
|
-
|
|
248
|
-
# Check GPU container update
|
|
249
|
-
gpu_container = pod_template_spec["containers"][0]
|
|
250
|
-
assert gpu_container["name"] == "main-gpu-container"
|
|
251
|
-
|
|
252
|
-
# Check env
|
|
253
|
-
env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
|
|
254
|
-
assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
|
|
255
|
-
|
|
256
|
-
# Check volume mounts
|
|
257
|
-
volume_mounts = {
|
|
258
|
-
vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
|
|
259
|
-
}
|
|
260
|
-
assert volume_mounts["tcpx-socket"] == "/tmp"
|
|
261
|
-
assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
|
|
262
|
-
assert volume_mounts["dshm"] == "/dev/shm"
|
|
263
|
-
|
|
264
|
-
# Check non-GPU container is not updated
|
|
265
|
-
sidecar_container = pod_template_spec["containers"][1]
|
|
266
|
-
assert "env" not in sidecar_container
|
|
267
|
-
assert "volumeMounts" not in sidecar_container
|
|
@@ -22,22 +22,6 @@ from ...utils.yaml import literal_string
|
|
|
22
22
|
rxdm = 'v1.0.12'
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def decorate_kjob_template(job_manifest: dict) -> dict:
|
|
26
|
-
spec = (
|
|
27
|
-
job_manifest.setdefault('spec', {})
|
|
28
|
-
.setdefault('template', {})
|
|
29
|
-
.setdefault('spec', {})
|
|
30
|
-
)
|
|
31
|
-
spec.setdefault('tolerations', [])
|
|
32
|
-
spec.setdefault('volumes', [])
|
|
33
|
-
|
|
34
|
-
add_volumes(job_manifest)
|
|
35
|
-
add_tolerations(job_manifest)
|
|
36
|
-
add_tcpxo_daemon_container(job_manifest)
|
|
37
|
-
update_gpu_containers(job_manifest)
|
|
38
|
-
return job_manifest
|
|
39
|
-
|
|
40
|
-
|
|
41
25
|
def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
|
|
42
26
|
job_manifest.setdefault('spec', {}).setdefault('template', {}).setdefault(
|
|
43
27
|
'metadata', {}
|
xpk/parser/common.py
CHANGED
|
@@ -144,23 +144,6 @@ def add_cluster_arguments(
|
|
|
144
144
|
)
|
|
145
145
|
|
|
146
146
|
|
|
147
|
-
def add_kind_cluster_arguments(
|
|
148
|
-
custom_parser_or_group: ParserOrArgumentGroup,
|
|
149
|
-
) -> None:
|
|
150
|
-
"""Add kind cluster arguments to the parser or argument group.
|
|
151
|
-
|
|
152
|
-
Args:
|
|
153
|
-
custom_parser_or_group: parser or argument group to add shared arguments to.
|
|
154
|
-
"""
|
|
155
|
-
custom_parser_or_group.add_argument(
|
|
156
|
-
'--kind-cluster',
|
|
157
|
-
type=bool,
|
|
158
|
-
action=argparse.BooleanOptionalAction,
|
|
159
|
-
default=False,
|
|
160
|
-
help='Apply command to a local test cluster.',
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
|
|
164
147
|
def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
|
|
165
148
|
"""Add global - no cloud dependent - arguments to the parser.
|
|
166
149
|
|
xpk/parser/core.py
CHANGED
|
@@ -23,13 +23,8 @@ from .cluster import set_cluster_parser
|
|
|
23
23
|
from .inspector import set_inspector_parser
|
|
24
24
|
from .storage import set_storage_parser
|
|
25
25
|
from .workload import set_workload_parsers
|
|
26
|
-
from .batch import set_batch_parser
|
|
27
|
-
from .job import set_job_parser
|
|
28
26
|
from .info import set_info_parser
|
|
29
|
-
from .kind import set_kind_parser
|
|
30
|
-
from .shell import set_shell_parser
|
|
31
27
|
from .version import set_version_parser
|
|
32
|
-
from .run import set_run_parser
|
|
33
28
|
|
|
34
29
|
|
|
35
30
|
def set_parser(parser: argparse.ArgumentParser):
|
|
@@ -54,20 +49,6 @@ def set_parser(parser: argparse.ArgumentParser):
|
|
|
54
49
|
"info",
|
|
55
50
|
help="Commands around listing kueue clusterqueues and localqueues.",
|
|
56
51
|
)
|
|
57
|
-
batch_parser = xpk_subcommands.add_parser(
|
|
58
|
-
"batch",
|
|
59
|
-
help="commands around running batch job",
|
|
60
|
-
)
|
|
61
|
-
job_parser = xpk_subcommands.add_parser(
|
|
62
|
-
"job", help="commands around listing, cancelling and investigating jobs"
|
|
63
|
-
)
|
|
64
|
-
kind_parser = xpk_subcommands.add_parser(
|
|
65
|
-
"kind",
|
|
66
|
-
help="commands around Kind cluster management",
|
|
67
|
-
)
|
|
68
|
-
shell_parser = xpk_subcommands.add_parser(
|
|
69
|
-
"shell", help="Commands around configuring and using interactive shell."
|
|
70
|
-
)
|
|
71
52
|
version_parser = xpk_subcommands.add_parser(
|
|
72
53
|
"version", help="Command to get xpk version"
|
|
73
54
|
)
|
|
@@ -76,11 +57,6 @@ def set_parser(parser: argparse.ArgumentParser):
|
|
|
76
57
|
"config", help="Commands to set and retrieve values from xpk config."
|
|
77
58
|
)
|
|
78
59
|
|
|
79
|
-
run_parser = xpk_subcommands.add_parser(
|
|
80
|
-
"run",
|
|
81
|
-
help="Command to run parallel jobs",
|
|
82
|
-
)
|
|
83
|
-
|
|
84
60
|
def default_subcommand_function(
|
|
85
61
|
_args,
|
|
86
62
|
) -> int: # args is unused, so pylint: disable=invalid-name
|
|
@@ -96,14 +72,9 @@ def set_parser(parser: argparse.ArgumentParser):
|
|
|
96
72
|
parser.print_help()
|
|
97
73
|
cluster_parser.print_help()
|
|
98
74
|
workload_parser.print_help()
|
|
99
|
-
batch_parser.print_help()
|
|
100
75
|
info_parser.print_help()
|
|
101
|
-
job_parser.print_help()
|
|
102
|
-
shell_parser.print_help()
|
|
103
76
|
version_parser.print_help()
|
|
104
|
-
kind_parser.print_help()
|
|
105
77
|
config_parser.print_help()
|
|
106
|
-
run_parser.print_help()
|
|
107
78
|
|
|
108
79
|
storage_parser.print_help()
|
|
109
80
|
return 0
|
|
@@ -111,25 +82,15 @@ def set_parser(parser: argparse.ArgumentParser):
|
|
|
111
82
|
parser.set_defaults(func=default_subcommand_function)
|
|
112
83
|
workload_parser.set_defaults(func=default_subcommand_function)
|
|
113
84
|
cluster_parser.set_defaults(func=default_subcommand_function)
|
|
114
|
-
batch_parser.set_defaults(func=default_subcommand_function)
|
|
115
85
|
info_parser.set_defaults(func=default_subcommand_function)
|
|
116
|
-
job_parser.set_defaults(func=default_subcommand_function)
|
|
117
|
-
kind_parser.set_defaults(func=default_subcommand_function)
|
|
118
|
-
shell_parser.set_defaults(func=default_subcommand_function)
|
|
119
86
|
storage_parser.set_defaults(func=default_subcommand_function)
|
|
120
87
|
version_parser.set_defaults(func=default_subcommand_function)
|
|
121
88
|
config_parser.set_defaults(func=default_subcommand_function)
|
|
122
|
-
run_parser.set_defaults(func=default_subcommand_function)
|
|
123
89
|
|
|
124
90
|
set_workload_parsers(workload_parser=workload_parser)
|
|
125
91
|
set_cluster_parser(cluster_parser=cluster_parser)
|
|
126
92
|
set_inspector_parser(inspector_parser=inspector_parser)
|
|
127
|
-
set_batch_parser(batch_parser=batch_parser)
|
|
128
93
|
set_info_parser(info_parser=info_parser)
|
|
129
|
-
set_job_parser(job_parser=job_parser)
|
|
130
|
-
set_kind_parser(kind_parser=kind_parser)
|
|
131
|
-
set_shell_parser(shell_parser=shell_parser)
|
|
132
94
|
set_storage_parser(storage_parser=storage_parser)
|
|
133
95
|
set_version_parser(version_parser=version_parser)
|
|
134
96
|
set_config_parsers(config_parser=config_parser)
|
|
135
|
-
set_run_parser(run_parser=run_parser)
|