xpk 0.17.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/cluster.py +4 -35
  2. xpk/commands/cluster_gcluster.py +1 -13
  3. xpk/commands/cluster_gcluster_test.py +2 -10
  4. xpk/commands/cluster_test.py +0 -4
  5. xpk/commands/workload.py +10 -3
  6. xpk/commands/workload_test.py +1 -0
  7. xpk/core/cluster.py +10 -9
  8. xpk/core/config.py +5 -17
  9. xpk/core/kueue_manager_test.py +2 -0
  10. xpk/core/nodepool.py +6 -0
  11. xpk/core/nodepool_test.py +4 -0
  12. xpk/core/scheduling.py +28 -3
  13. xpk/core/scheduling_test.py +38 -1
  14. xpk/core/system_characteristics.py +39 -16
  15. xpk/core/system_characteristics_test.py +11 -0
  16. xpk/core/workload_decorators/rdma_decorator.py +0 -15
  17. xpk/core/workload_decorators/tcpx_decorator.py +0 -8
  18. xpk/core/workload_decorators/tcpx_decorator_test.py +0 -78
  19. xpk/core/workload_decorators/tcpxo_decorator.py +0 -16
  20. xpk/parser/common.py +0 -17
  21. xpk/parser/core.py +0 -39
  22. xpk/parser/storage.py +0 -11
  23. xpk/utils/feature_flags.py +1 -1
  24. xpk/utils/validation.py +0 -8
  25. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/METADATA +15 -4
  26. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/RECORD +30 -41
  27. xpk/commands/batch.py +0 -144
  28. xpk/commands/job.py +0 -244
  29. xpk/commands/kind.py +0 -286
  30. xpk/commands/kjob_common.py +0 -60
  31. xpk/commands/run.py +0 -140
  32. xpk/commands/shell.py +0 -142
  33. xpk/parser/batch.py +0 -43
  34. xpk/parser/job.py +0 -147
  35. xpk/parser/kind.py +0 -95
  36. xpk/parser/run.py +0 -47
  37. xpk/parser/shell.py +0 -59
  38. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/WHEEL +0 -0
  39. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.17.2.dist-info → xpk-1.0.0.dist-info}/top_level.txt +0 -0
@@ -80,15 +80,6 @@ class GpuConfig:
80
80
 
81
81
  requires_topology: bool
82
82
  gpu_direct_name: Literal['fastrak', 'rdma', 'tcpx', 'tcpxo'] = 'fastrak'
83
- kjob_decorator_fn: Optional[Callable[[dict], dict]] = None
84
- """A function to decorate the kjob template for GPU-specific configurations.
85
-
86
- Args:
87
- job_manifest (dict): The kjob manifest as a dictionary.
88
-
89
- Returns:
90
- dict: The modified kjob manifest as a dictionary.
91
- """
92
83
  nccl_installer: Optional[str] = None
93
84
  jobset_decorator_fn: Optional[Callable[[str, list[str]], str]] = None
94
85
  """A function to decorate the jobset for GPU-specific configurations.
@@ -106,7 +97,7 @@ class GpuConfig:
106
97
  parts = []
107
98
  for f in dataclasses.fields(self):
108
99
  value = getattr(self, f.name)
109
- if f.name in ('kjob_decorator_fn', 'jobset_decorator_fn') and value:
100
+ if f.name in ('jobset_decorator_fn') and value:
110
101
  parts.append(f'{f.name}=<function {value.__name__}>')
111
102
  else:
112
103
  parts.append(f'{f.name}={repr(value)}')
@@ -151,6 +142,7 @@ class SystemCharacteristics:
151
142
  device_type: str
152
143
  supports_sub_slicing: bool
153
144
  supports_super_slicing: bool
145
+ supports_accelerator_network_profile: bool
154
146
  docker_platform: DockerPlatform
155
147
  requires_workload_policy: bool = False
156
148
  gpu_config: Optional[GpuConfig] = None
@@ -242,6 +234,7 @@ def get_tpu_system_characteristics_map(
242
234
  machine_type: str,
243
235
  supported_topologies: list[str],
244
236
  docker_platform: DockerPlatform,
237
+ supports_accelerator_network_profile: bool,
245
238
  tpu_type_requires_workload_policy: bool = False,
246
239
  default_topologies: set[str] | None = None,
247
240
  sub_slicing_topologies: set[str] | None = None,
@@ -268,6 +261,7 @@ def get_tpu_system_characteristics_map(
268
261
  and vms_per_slice > 1,
269
262
  supports_sub_slicing=topology in sub_slicing_topologies,
270
263
  supports_super_slicing=topology in super_slicing_topologies,
264
+ supports_accelerator_network_profile=supports_accelerator_network_profile,
271
265
  docker_platform=docker_platform,
272
266
  )
273
267
  system_characteristics_map[f'{prefix}-{topology}'] = system
@@ -312,6 +306,7 @@ UserFacingNameToSystemCharacteristics = {
312
306
  device_type='l4-1',
313
307
  supports_sub_slicing=False,
314
308
  supports_super_slicing=False,
309
+ supports_accelerator_network_profile=False,
315
310
  gpu_config=GpuConfig(requires_topology=False),
316
311
  docker_platform=AMD_PLATFORM,
317
312
  ),
@@ -325,6 +320,7 @@ UserFacingNameToSystemCharacteristics = {
325
320
  device_type='l4-2',
326
321
  supports_sub_slicing=False,
327
322
  supports_super_slicing=False,
323
+ supports_accelerator_network_profile=False,
328
324
  gpu_config=GpuConfig(requires_topology=False),
329
325
  docker_platform=AMD_PLATFORM,
330
326
  ),
@@ -338,6 +334,7 @@ UserFacingNameToSystemCharacteristics = {
338
334
  device_type='l4-4',
339
335
  supports_sub_slicing=False,
340
336
  supports_super_slicing=False,
337
+ supports_accelerator_network_profile=False,
341
338
  gpu_config=GpuConfig(requires_topology=False),
342
339
  docker_platform=AMD_PLATFORM,
343
340
  ),
@@ -351,6 +348,7 @@ UserFacingNameToSystemCharacteristics = {
351
348
  device_type='l4-8',
352
349
  supports_sub_slicing=False,
353
350
  supports_super_slicing=False,
351
+ supports_accelerator_network_profile=False,
354
352
  gpu_config=GpuConfig(requires_topology=False),
355
353
  docker_platform=AMD_PLATFORM,
356
354
  ),
@@ -365,6 +363,7 @@ UserFacingNameToSystemCharacteristics = {
365
363
  device_type='a100-40gb-1',
366
364
  supports_sub_slicing=False,
367
365
  supports_super_slicing=False,
366
+ supports_accelerator_network_profile=False,
368
367
  gpu_config=GpuConfig(requires_topology=False),
369
368
  docker_platform=AMD_PLATFORM,
370
369
  ),
@@ -378,6 +377,7 @@ UserFacingNameToSystemCharacteristics = {
378
377
  device_type='a100-40gb-2',
379
378
  supports_sub_slicing=False,
380
379
  supports_super_slicing=False,
380
+ supports_accelerator_network_profile=False,
381
381
  gpu_config=GpuConfig(requires_topology=False),
382
382
  docker_platform=AMD_PLATFORM,
383
383
  ),
@@ -391,6 +391,7 @@ UserFacingNameToSystemCharacteristics = {
391
391
  device_type='a100-40gb-4',
392
392
  supports_sub_slicing=False,
393
393
  supports_super_slicing=False,
394
+ supports_accelerator_network_profile=False,
394
395
  gpu_config=GpuConfig(requires_topology=False),
395
396
  docker_platform=AMD_PLATFORM,
396
397
  ),
@@ -404,6 +405,7 @@ UserFacingNameToSystemCharacteristics = {
404
405
  device_type='a100-40gb-8',
405
406
  supports_sub_slicing=False,
406
407
  supports_super_slicing=False,
408
+ supports_accelerator_network_profile=False,
407
409
  gpu_config=GpuConfig(requires_topology=False),
408
410
  docker_platform=AMD_PLATFORM,
409
411
  ),
@@ -417,10 +419,10 @@ UserFacingNameToSystemCharacteristics = {
417
419
  device_type='gb200-4',
418
420
  supports_sub_slicing=False,
419
421
  supports_super_slicing=False,
422
+ supports_accelerator_network_profile=True,
420
423
  gpu_config=GpuConfig(
421
424
  requires_topology=True,
422
425
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
423
- kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
424
426
  jobset_decorator_fn=rdma_decorator.decorate_jobset,
425
427
  gpu_direct_name='rdma',
426
428
  ),
@@ -436,10 +438,10 @@ UserFacingNameToSystemCharacteristics = {
436
438
  device_type='gb200-4',
437
439
  supports_sub_slicing=False,
438
440
  supports_super_slicing=False,
441
+ supports_accelerator_network_profile=True,
439
442
  gpu_config=GpuConfig(
440
443
  requires_topology=True,
441
444
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
442
- kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
443
445
  jobset_decorator_fn=rdma_decorator.decorate_jobset,
444
446
  gpu_direct_name='rdma',
445
447
  ),
@@ -455,10 +457,10 @@ UserFacingNameToSystemCharacteristics = {
455
457
  device_type='b200-8',
456
458
  supports_sub_slicing=False,
457
459
  supports_super_slicing=False,
460
+ supports_accelerator_network_profile=True,
458
461
  gpu_config=GpuConfig(
459
462
  requires_topology=True,
460
463
  nccl_installer=INSTALLER_NCCL_RDMA,
461
- kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
462
464
  jobset_decorator_fn=rdma_decorator.decorate_jobset,
463
465
  gpu_direct_name='rdma',
464
466
  ),
@@ -474,10 +476,10 @@ UserFacingNameToSystemCharacteristics = {
474
476
  device_type='h200-141gb-8',
475
477
  supports_sub_slicing=False,
476
478
  supports_super_slicing=False,
479
+ supports_accelerator_network_profile=True,
477
480
  gpu_config=GpuConfig(
478
481
  requires_topology=True,
479
482
  nccl_installer=INSTALLER_NCCL_RDMA,
480
- kjob_decorator_fn=rdma_decorator.decorate_kjob_template,
481
483
  jobset_decorator_fn=rdma_decorator.decorate_jobset,
482
484
  gpu_direct_name='rdma',
483
485
  ),
@@ -494,10 +496,10 @@ UserFacingNameToSystemCharacteristics = {
494
496
  device_type='h100-80gb-8',
495
497
  supports_sub_slicing=False,
496
498
  supports_super_slicing=False,
499
+ supports_accelerator_network_profile=True,
497
500
  gpu_config=GpuConfig(
498
501
  requires_topology=True,
499
502
  nccl_installer=INSTALLER_NCCL_TCPX,
500
- kjob_decorator_fn=tcpx_decorator.decorate_kjob_template,
501
503
  jobset_decorator_fn=tcpx_decorator.decorate_jobset,
502
504
  gpu_direct_name='tcpx',
503
505
  ),
@@ -514,10 +516,10 @@ UserFacingNameToSystemCharacteristics = {
514
516
  device_type='h100-mega-80gb-8',
515
517
  supports_sub_slicing=False,
516
518
  supports_super_slicing=False,
519
+ supports_accelerator_network_profile=True,
517
520
  gpu_config=GpuConfig(
518
521
  requires_topology=True,
519
522
  nccl_installer=INSTALLER_NCCL_TCPXO,
520
- kjob_decorator_fn=tcpxo_decorator.decorate_kjob_template,
521
523
  jobset_decorator_fn=tcpxo_decorator.decorate_jobset,
522
524
  gpu_direct_name='tcpxo',
523
525
  ),
@@ -531,6 +533,7 @@ UserFacingNameToSystemCharacteristics = {
531
533
  machine_type='tpu7x-standard-1t',
532
534
  supported_topologies=['1x1x1'],
533
535
  tpu_type_requires_workload_policy=True,
536
+ supports_accelerator_network_profile=False,
534
537
  docker_platform=AMD_PLATFORM,
535
538
  ),
536
539
  **get_tpu_system_characteristics_map(
@@ -539,6 +542,7 @@ UserFacingNameToSystemCharacteristics = {
539
542
  gke_accelerator='tpu7x',
540
543
  machine_type='tpu7x-standard-4t',
541
544
  tpu_type_requires_workload_policy=True,
545
+ supports_accelerator_network_profile=False,
542
546
  docker_platform=AMD_PLATFORM,
543
547
  supported_topologies=generate_tpu_topologies(max_cubes=144),
544
548
  super_slicing_topologies=set(['4x4x4']),
@@ -650,6 +654,7 @@ UserFacingNameToSystemCharacteristics = {
650
654
  machine_type='ct6e-standard-1t',
651
655
  supported_topologies=['1x1'],
652
656
  docker_platform=AMD_PLATFORM,
657
+ supports_accelerator_network_profile=True,
653
658
  ),
654
659
  **get_tpu_system_characteristics_map(
655
660
  prefix='v6e',
@@ -659,6 +664,7 @@ UserFacingNameToSystemCharacteristics = {
659
664
  supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
660
665
  sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
661
666
  docker_platform=AMD_PLATFORM,
667
+ supports_accelerator_network_profile=True,
662
668
  ),
663
669
  **get_tpu_system_characteristics_map(
664
670
  prefix='v5p',
@@ -667,6 +673,7 @@ UserFacingNameToSystemCharacteristics = {
667
673
  machine_type='ct5p-hightpu-4t',
668
674
  docker_platform=AMD_PLATFORM,
669
675
  supported_topologies=generate_tpu_topologies(max_cubes=140),
676
+ supports_accelerator_network_profile=False,
670
677
  default_topologies=set([
671
678
  '2x2x1',
672
679
  '2x2x2',
@@ -773,6 +780,7 @@ UserFacingNameToSystemCharacteristics = {
773
780
  machine_type='ct5lp-hightpu-4t',
774
781
  docker_platform=AMD_PLATFORM,
775
782
  supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
783
+ supports_accelerator_network_profile=False,
776
784
  ),
777
785
  **get_tpu_system_characteristics_map(
778
786
  prefix='v4',
@@ -783,6 +791,7 @@ UserFacingNameToSystemCharacteristics = {
783
791
  supported_topologies=generate_tpu_topologies(
784
792
  max_cubes=64, enforce_nondecreasing=False
785
793
  ),
794
+ supports_accelerator_network_profile=False,
786
795
  default_topologies=set([
787
796
  '2x2x1',
788
797
  '2x2x2',
@@ -811,6 +820,7 @@ UserFacingNameToSystemCharacteristics = {
811
820
  device_type='m1-megamem-96-1',
812
821
  supports_sub_slicing=False,
813
822
  supports_super_slicing=False,
823
+ supports_accelerator_network_profile=False,
814
824
  docker_platform=AMD_PLATFORM,
815
825
  ),
816
826
  # n2-standard-#vCPUs-#VMs
@@ -824,6 +834,7 @@ UserFacingNameToSystemCharacteristics = {
824
834
  device_type='n2-standard-64-1',
825
835
  supports_sub_slicing=False,
826
836
  supports_super_slicing=False,
837
+ supports_accelerator_network_profile=False,
827
838
  docker_platform=AMD_PLATFORM,
828
839
  ),
829
840
  'n2-standard-32-1': SystemCharacteristics(
@@ -836,6 +847,7 @@ UserFacingNameToSystemCharacteristics = {
836
847
  device_type='n2-standard-32-1',
837
848
  supports_sub_slicing=False,
838
849
  supports_super_slicing=False,
850
+ supports_accelerator_network_profile=False,
839
851
  docker_platform=AMD_PLATFORM,
840
852
  ),
841
853
  'n2-standard-32-2': SystemCharacteristics(
@@ -848,6 +860,7 @@ UserFacingNameToSystemCharacteristics = {
848
860
  device_type='n2-standard-32-2',
849
861
  supports_sub_slicing=False,
850
862
  supports_super_slicing=False,
863
+ supports_accelerator_network_profile=False,
851
864
  docker_platform=AMD_PLATFORM,
852
865
  ),
853
866
  'n2-standard-32-4': SystemCharacteristics(
@@ -860,6 +873,7 @@ UserFacingNameToSystemCharacteristics = {
860
873
  device_type='n2-standard-32-4',
861
874
  supports_sub_slicing=False,
862
875
  supports_super_slicing=False,
876
+ supports_accelerator_network_profile=False,
863
877
  docker_platform=AMD_PLATFORM,
864
878
  ),
865
879
  'n2-standard-32-8': SystemCharacteristics(
@@ -872,6 +886,7 @@ UserFacingNameToSystemCharacteristics = {
872
886
  device_type='n2-standard-32-8',
873
887
  supports_sub_slicing=False,
874
888
  supports_super_slicing=False,
889
+ supports_accelerator_network_profile=False,
875
890
  docker_platform=AMD_PLATFORM,
876
891
  ),
877
892
  'n2-standard-32-16': SystemCharacteristics(
@@ -884,6 +899,7 @@ UserFacingNameToSystemCharacteristics = {
884
899
  device_type='n2-standard-32-16',
885
900
  supports_sub_slicing=False,
886
901
  supports_super_slicing=False,
902
+ supports_accelerator_network_profile=False,
887
903
  docker_platform=AMD_PLATFORM,
888
904
  ),
889
905
  'n2-standard-32-32': SystemCharacteristics(
@@ -896,6 +912,7 @@ UserFacingNameToSystemCharacteristics = {
896
912
  device_type='n2-standard-32-32',
897
913
  supports_sub_slicing=False,
898
914
  supports_super_slicing=False,
915
+ supports_accelerator_network_profile=False,
899
916
  docker_platform=AMD_PLATFORM,
900
917
  ),
901
918
  'n2-standard-32-64': SystemCharacteristics(
@@ -908,6 +925,7 @@ UserFacingNameToSystemCharacteristics = {
908
925
  device_type='n2-standard-32-64',
909
926
  supports_sub_slicing=False,
910
927
  supports_super_slicing=False,
928
+ supports_accelerator_network_profile=False,
911
929
  docker_platform=AMD_PLATFORM,
912
930
  ),
913
931
  'n2-standard-32-128': SystemCharacteristics(
@@ -920,6 +938,7 @@ UserFacingNameToSystemCharacteristics = {
920
938
  device_type='n2-standard-32-128',
921
939
  supports_sub_slicing=False,
922
940
  supports_super_slicing=False,
941
+ supports_accelerator_network_profile=False,
923
942
  docker_platform=AMD_PLATFORM,
924
943
  ),
925
944
  'n2-standard-32-256': SystemCharacteristics(
@@ -932,6 +951,7 @@ UserFacingNameToSystemCharacteristics = {
932
951
  device_type='n2-standard-32-256',
933
952
  supports_sub_slicing=False,
934
953
  supports_super_slicing=False,
954
+ supports_accelerator_network_profile=False,
935
955
  docker_platform=AMD_PLATFORM,
936
956
  ),
937
957
  'n2-standard-32-512': SystemCharacteristics(
@@ -944,6 +964,7 @@ UserFacingNameToSystemCharacteristics = {
944
964
  device_type='n2-standard-32-512',
945
965
  supports_sub_slicing=False,
946
966
  supports_super_slicing=False,
967
+ supports_accelerator_network_profile=False,
947
968
  docker_platform=AMD_PLATFORM,
948
969
  ),
949
970
  'n2-standard-32-1024': SystemCharacteristics(
@@ -956,6 +977,7 @@ UserFacingNameToSystemCharacteristics = {
956
977
  device_type='n2-standard-32-1024',
957
978
  supports_sub_slicing=False,
958
979
  supports_super_slicing=False,
980
+ supports_accelerator_network_profile=False,
959
981
  docker_platform=AMD_PLATFORM,
960
982
  ),
961
983
  'n2-standard-32-2048': SystemCharacteristics(
@@ -968,6 +990,7 @@ UserFacingNameToSystemCharacteristics = {
968
990
  device_type='n2-standard-32-2048',
969
991
  supports_sub_slicing=False,
970
992
  supports_super_slicing=False,
993
+ supports_accelerator_network_profile=False,
971
994
  docker_platform=AMD_PLATFORM,
972
995
  ),
973
996
  }
@@ -34,6 +34,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
34
34
  supported_topologies=["1x1"],
35
35
  docker_platform=DockerPlatform.AMD,
36
36
  tpu_type_requires_workload_policy=False,
37
+ supports_accelerator_network_profile=False,
37
38
  )
38
39
 
39
40
  expected_system_characteristics = SystemCharacteristics(
@@ -48,6 +49,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
48
49
  supports_super_slicing=False,
49
50
  docker_platform=DockerPlatform.AMD,
50
51
  requires_workload_policy=False,
52
+ supports_accelerator_network_profile=False,
51
53
  )
52
54
  assert result == {
53
55
  "test-1": expected_system_characteristics,
@@ -62,6 +64,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
62
64
  gke_accelerator="test",
63
65
  machine_type="test",
64
66
  supported_topologies=["2x2"],
67
+ supports_accelerator_network_profile=False,
65
68
  docker_platform=DockerPlatform.AMD,
66
69
  tpu_type_requires_workload_policy=True,
67
70
  )
@@ -76,6 +79,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
76
79
  device_type="test-8",
77
80
  supports_sub_slicing=False,
78
81
  supports_super_slicing=False,
82
+ supports_accelerator_network_profile=False,
79
83
  docker_platform=DockerPlatform.AMD,
80
84
  requires_workload_policy=False,
81
85
  )
@@ -94,6 +98,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
94
98
  supported_topologies=["2x2x2"],
95
99
  docker_platform=DockerPlatform.AMD,
96
100
  tpu_type_requires_workload_policy=True,
101
+ supports_accelerator_network_profile=False,
97
102
  )
98
103
 
99
104
  expected_system_characteristics = SystemCharacteristics(
@@ -106,6 +111,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
106
111
  device_type="test-16",
107
112
  supports_sub_slicing=False,
108
113
  supports_super_slicing=False,
114
+ supports_accelerator_network_profile=False,
109
115
  docker_platform=DockerPlatform.AMD,
110
116
  requires_workload_policy=True,
111
117
  )
@@ -122,6 +128,7 @@ def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
122
128
  gke_accelerator="test",
123
129
  machine_type="test",
124
130
  supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
131
+ supports_accelerator_network_profile=False,
125
132
  docker_platform=DockerPlatform.AMD,
126
133
  sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
127
134
  )
@@ -138,6 +145,7 @@ def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
138
145
  gke_accelerator="test",
139
146
  machine_type="test",
140
147
  supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
148
+ supports_accelerator_network_profile=False,
141
149
  docker_platform=DockerPlatform.AMD,
142
150
  super_slicing_topologies=set(["4x4x8", "4x4x16"]),
143
151
  )
@@ -154,6 +162,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
154
162
  gke_accelerator="test",
155
163
  machine_type="test",
156
164
  supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
165
+ supports_accelerator_network_profile=False,
157
166
  docker_platform=DockerPlatform.AMD,
158
167
  default_topologies=set(["4x8x16"]),
159
168
  )
@@ -206,6 +215,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
206
215
  device_type="l4-1",
207
216
  supports_sub_slicing=False,
208
217
  supports_super_slicing=False,
218
+ supports_accelerator_network_profile=False,
209
219
  docker_platform=DockerPlatform.AMD,
210
220
  gpu_config=GpuConfig(requires_topology=False),
211
221
  )
@@ -225,5 +235,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
225
235
  device_type="l4-1",
226
236
  supports_sub_slicing=False,
227
237
  supports_super_slicing=False,
238
+ supports_accelerator_network_profile=False,
228
239
  docker_platform=DockerPlatform.AMD,
229
240
  )
@@ -18,21 +18,6 @@ import yaml
18
18
  from ...utils.yaml import literal_string
19
19
 
20
20
 
21
- def decorate_kjob_template(job_manifest: dict) -> dict:
22
- spec = (
23
- job_manifest.setdefault('spec', {})
24
- .setdefault('template', {})
25
- .setdefault('spec', {})
26
- )
27
- spec.setdefault('tolerations', [])
28
- spec.setdefault('volumes', [])
29
-
30
- add_volumes(job_manifest)
31
- add_tolerations(job_manifest)
32
- update_gpu_containers(job_manifest)
33
- return job_manifest
34
-
35
-
36
21
  def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
37
22
  """
38
23
  Decorates a JobSet manifest with the necessary components for rdma-daemon.
@@ -22,14 +22,6 @@ from ...utils.yaml import literal_string
22
22
  tcpx = 'v2.0.11'
23
23
 
24
24
 
25
- def decorate_kjob_template(job_manifest: dict) -> dict:
26
- add_volumes(job_manifest)
27
- add_tolerations(job_manifest)
28
- add_tcpx_daemon_container(job_manifest)
29
- update_gpu_containers(job_manifest)
30
- return job_manifest
31
-
32
-
33
25
  def decorate_job(job_manifest: dict) -> dict:
34
26
  add_annotations(job_manifest)
35
27
  add_volumes(job_manifest)
@@ -47,24 +47,6 @@ spec:
47
47
  image: my-sidecar-image
48
48
  """
49
49
 
50
- # Minimal kjob template for testing
51
- BASE_KJOB_TEMPLATE = {
52
- "spec": {
53
- "template": {
54
- "spec": {
55
- "containers": [
56
- {
57
- "name": "main-gpu-container",
58
- "image": "my-gpu-image",
59
- "resources": {"limits": {"nvidia.com/gpu": 8}},
60
- },
61
- {"name": "sidecar-container", "image": "my-sidecar-image"},
62
- ]
63
- }
64
- }
65
- }
66
- }
67
-
68
50
  # Minimal job manifest for testing
69
51
  BASE_JOB_MANIFEST = {
70
52
  "spec": {
@@ -205,63 +187,3 @@ def test_decorate_job():
205
187
  assert "devices.gke.io/container.tcpx-daemon" in annotations
206
188
  assert "networking.gke.io/default-interface" in annotations
207
189
  assert "networking.gke.io/interfaces" in annotations
208
-
209
-
210
- def test_decorate_kjob_template():
211
- """Tests decorate_kjob_template."""
212
- kjob_template = copy.deepcopy(BASE_KJOB_TEMPLATE)
213
-
214
- decorated_manifest = tcpx_decorator.decorate_kjob_template(kjob_template)
215
-
216
- pod_template_spec = decorated_manifest["spec"]["template"]["spec"]
217
-
218
- # Check annotations are NOT added
219
- assert "annotations" not in decorated_manifest["spec"]["template"].get(
220
- "metadata", {}
221
- )
222
-
223
- # Check tolerations
224
- tolerations = pod_template_spec["tolerations"]
225
- assert {
226
- "key": "user-workload",
227
- "operator": "Equal",
228
- "value": "true",
229
- "effect": "NoSchedule",
230
- } in tolerations
231
-
232
- # Check volumes
233
- volumes = pod_template_spec["volumes"]
234
- volume_names = {v["name"] for v in volumes}
235
- assert "libraries" in volume_names
236
- assert "sys" in volume_names
237
- assert "proc-sys" in volume_names
238
- assert "tcpx-socket" in volume_names
239
- assert "dshm" in volume_names
240
-
241
- # Check init container
242
- init_containers = pod_template_spec["initContainers"]
243
- assert len(init_containers) == 1
244
- tcpx_daemon = init_containers[0]
245
- assert tcpx_daemon["name"] == "tcpx-daemon"
246
- assert tcpx_daemon["image"].endswith(f":{tcpx_decorator.tcpx}")
247
-
248
- # Check GPU container update
249
- gpu_container = pod_template_spec["containers"][0]
250
- assert gpu_container["name"] == "main-gpu-container"
251
-
252
- # Check env
253
- env_vars = {e["name"]: e["value"] for e in gpu_container["env"]}
254
- assert env_vars["LD_LIBRARY_PATH"] == "/usr/local/nvidia/lib64"
255
-
256
- # Check volume mounts
257
- volume_mounts = {
258
- vm["name"]: vm["mountPath"] for vm in gpu_container["volumeMounts"]
259
- }
260
- assert volume_mounts["tcpx-socket"] == "/tmp"
261
- assert volume_mounts["libraries"] == "/usr/local/nvidia/lib64"
262
- assert volume_mounts["dshm"] == "/dev/shm"
263
-
264
- # Check non-GPU container is not updated
265
- sidecar_container = pod_template_spec["containers"][1]
266
- assert "env" not in sidecar_container
267
- assert "volumeMounts" not in sidecar_container
@@ -22,22 +22,6 @@ from ...utils.yaml import literal_string
22
22
  rxdm = 'v1.0.12'
23
23
 
24
24
 
25
- def decorate_kjob_template(job_manifest: dict) -> dict:
26
- spec = (
27
- job_manifest.setdefault('spec', {})
28
- .setdefault('template', {})
29
- .setdefault('spec', {})
30
- )
31
- spec.setdefault('tolerations', [])
32
- spec.setdefault('volumes', [])
33
-
34
- add_volumes(job_manifest)
35
- add_tolerations(job_manifest)
36
- add_tcpxo_daemon_container(job_manifest)
37
- update_gpu_containers(job_manifest)
38
- return job_manifest
39
-
40
-
41
25
  def decorate_job(job_manifest: dict, sub_networks: list[str]) -> dict:
42
26
  job_manifest.setdefault('spec', {}).setdefault('template', {}).setdefault(
43
27
  'metadata', {}
xpk/parser/common.py CHANGED
@@ -144,23 +144,6 @@ def add_cluster_arguments(
144
144
  )
145
145
 
146
146
 
147
- def add_kind_cluster_arguments(
148
- custom_parser_or_group: ParserOrArgumentGroup,
149
- ) -> None:
150
- """Add kind cluster arguments to the parser or argument group.
151
-
152
- Args:
153
- custom_parser_or_group: parser or argument group to add shared arguments to.
154
- """
155
- custom_parser_or_group.add_argument(
156
- '--kind-cluster',
157
- type=bool,
158
- action=argparse.BooleanOptionalAction,
159
- default=False,
160
- help='Apply command to a local test cluster.',
161
- )
162
-
163
-
164
147
  def add_global_arguments(custom_parser_or_group: ParserOrArgumentGroup):
165
148
  """Add global - no cloud dependent - arguments to the parser.
166
149
 
xpk/parser/core.py CHANGED
@@ -23,13 +23,8 @@ from .cluster import set_cluster_parser
23
23
  from .inspector import set_inspector_parser
24
24
  from .storage import set_storage_parser
25
25
  from .workload import set_workload_parsers
26
- from .batch import set_batch_parser
27
- from .job import set_job_parser
28
26
  from .info import set_info_parser
29
- from .kind import set_kind_parser
30
- from .shell import set_shell_parser
31
27
  from .version import set_version_parser
32
- from .run import set_run_parser
33
28
 
34
29
 
35
30
  def set_parser(parser: argparse.ArgumentParser):
@@ -54,20 +49,6 @@ def set_parser(parser: argparse.ArgumentParser):
54
49
  "info",
55
50
  help="Commands around listing kueue clusterqueues and localqueues.",
56
51
  )
57
- batch_parser = xpk_subcommands.add_parser(
58
- "batch",
59
- help="commands around running batch job",
60
- )
61
- job_parser = xpk_subcommands.add_parser(
62
- "job", help="commands around listing, cancelling and investigating jobs"
63
- )
64
- kind_parser = xpk_subcommands.add_parser(
65
- "kind",
66
- help="commands around Kind cluster management",
67
- )
68
- shell_parser = xpk_subcommands.add_parser(
69
- "shell", help="Commands around configuring and using interactive shell."
70
- )
71
52
  version_parser = xpk_subcommands.add_parser(
72
53
  "version", help="Command to get xpk version"
73
54
  )
@@ -76,11 +57,6 @@ def set_parser(parser: argparse.ArgumentParser):
76
57
  "config", help="Commands to set and retrieve values from xpk config."
77
58
  )
78
59
 
79
- run_parser = xpk_subcommands.add_parser(
80
- "run",
81
- help="Command to run parallel jobs",
82
- )
83
-
84
60
  def default_subcommand_function(
85
61
  _args,
86
62
  ) -> int: # args is unused, so pylint: disable=invalid-name
@@ -96,14 +72,9 @@ def set_parser(parser: argparse.ArgumentParser):
96
72
  parser.print_help()
97
73
  cluster_parser.print_help()
98
74
  workload_parser.print_help()
99
- batch_parser.print_help()
100
75
  info_parser.print_help()
101
- job_parser.print_help()
102
- shell_parser.print_help()
103
76
  version_parser.print_help()
104
- kind_parser.print_help()
105
77
  config_parser.print_help()
106
- run_parser.print_help()
107
78
 
108
79
  storage_parser.print_help()
109
80
  return 0
@@ -111,25 +82,15 @@ def set_parser(parser: argparse.ArgumentParser):
111
82
  parser.set_defaults(func=default_subcommand_function)
112
83
  workload_parser.set_defaults(func=default_subcommand_function)
113
84
  cluster_parser.set_defaults(func=default_subcommand_function)
114
- batch_parser.set_defaults(func=default_subcommand_function)
115
85
  info_parser.set_defaults(func=default_subcommand_function)
116
- job_parser.set_defaults(func=default_subcommand_function)
117
- kind_parser.set_defaults(func=default_subcommand_function)
118
- shell_parser.set_defaults(func=default_subcommand_function)
119
86
  storage_parser.set_defaults(func=default_subcommand_function)
120
87
  version_parser.set_defaults(func=default_subcommand_function)
121
88
  config_parser.set_defaults(func=default_subcommand_function)
122
- run_parser.set_defaults(func=default_subcommand_function)
123
89
 
124
90
  set_workload_parsers(workload_parser=workload_parser)
125
91
  set_cluster_parser(cluster_parser=cluster_parser)
126
92
  set_inspector_parser(inspector_parser=inspector_parser)
127
- set_batch_parser(batch_parser=batch_parser)
128
93
  set_info_parser(info_parser=info_parser)
129
- set_job_parser(job_parser=job_parser)
130
- set_kind_parser(kind_parser=kind_parser)
131
- set_shell_parser(shell_parser=shell_parser)
132
94
  set_storage_parser(storage_parser=storage_parser)
133
95
  set_version_parser(version_parser=version_parser)
134
96
  set_config_parsers(config_parser=config_parser)
135
- set_run_parser(run_parser=run_parser)