xpk 0.17.3__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. xpk/commands/cluster.py +33 -43
  2. xpk/commands/cluster_gcluster.py +19 -14
  3. xpk/commands/cluster_gcluster_test.py +2 -0
  4. xpk/commands/cluster_test.py +1 -21
  5. xpk/commands/common.py +39 -6
  6. xpk/commands/common_test.py +170 -0
  7. xpk/commands/info.py +9 -5
  8. xpk/commands/inspector.py +33 -4
  9. xpk/commands/inspector_test.py +142 -0
  10. xpk/commands/workload.py +32 -11
  11. xpk/commands/workload_test.py +71 -3
  12. xpk/core/blueprint/blueprint_generator.py +19 -8
  13. xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
  14. xpk/core/blueprint/testing/data/a4.yaml +3 -1
  15. xpk/core/capacity.py +37 -17
  16. xpk/core/capacity_test.py +66 -1
  17. xpk/core/cluster.py +11 -10
  18. xpk/core/cluster_private.py +3 -3
  19. xpk/core/cluster_test.py +29 -2
  20. xpk/core/config.py +5 -2
  21. xpk/core/docker_container.py +31 -24
  22. xpk/core/docker_manager.py +4 -4
  23. xpk/core/docker_resources.py +4 -1
  24. xpk/core/kueue_manager.py +6 -8
  25. xpk/core/kueue_manager_test.py +6 -5
  26. xpk/core/nap.py +14 -3
  27. xpk/core/nodepool.py +52 -13
  28. xpk/core/nodepool_test.py +147 -8
  29. xpk/core/remote_state/fuse_remote_state.py +1 -1
  30. xpk/core/scheduling.py +32 -4
  31. xpk/core/scheduling_test.py +39 -2
  32. xpk/core/system_characteristics.py +44 -0
  33. xpk/core/system_characteristics_test.py +11 -0
  34. xpk/core/telemetry.py +11 -1
  35. xpk/core/telemetry_test.py +39 -0
  36. xpk/core/testing/commands_tester.py +26 -0
  37. xpk/core/testing/commands_tester_test.py +20 -1
  38. xpk/core/workload_decorators/rdma_decorator.py +9 -0
  39. xpk/parser/cluster.py +11 -1
  40. xpk/parser/cluster_test.py +59 -1
  41. xpk/parser/common.py +11 -17
  42. xpk/parser/core.py +0 -8
  43. xpk/parser/storage.py +3 -14
  44. xpk/utils/console.py +1 -1
  45. xpk/utils/feature_flags.py +8 -4
  46. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/METADATA +50 -23
  47. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/RECORD +51 -60
  48. xpk-1.1.0.dist-info/top_level.txt +1 -0
  49. integration/README.md +0 -19
  50. integration/__init__.py +0 -15
  51. integration/docker_manager_test.py +0 -102
  52. integration/gcluster_a3mega_test.py +0 -215
  53. integration/gcluster_a3ultra_test.py +0 -187
  54. integration/gcluster_a4_test.py +0 -187
  55. integration/gcluster_test.py +0 -107
  56. xpk/commands/kind.py +0 -265
  57. xpk/parser/kind.py +0 -95
  58. xpk/utils/user_input.py +0 -48
  59. xpk/utils/user_input_test.py +0 -92
  60. xpk-0.17.3.dist-info/top_level.txt +0 -2
  61. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
  62. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
  63. {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -131,6 +131,8 @@ class SystemCharacteristics:
131
131
  supports_super_slicing: Whether the Super-slicing feature is supported.
132
132
  requires_workload_policy: A boolean indicating if a GCE resource
133
133
  workload policy is required. This is automatically set to True for GPUs.
134
+ parallel_containers: The number of containers running on a single VM.
135
+
134
136
  """
135
137
 
136
138
  topology: str
@@ -142,9 +144,11 @@ class SystemCharacteristics:
142
144
  device_type: str
143
145
  supports_sub_slicing: bool
144
146
  supports_super_slicing: bool
147
+ supports_accelerator_network_profile: bool
145
148
  docker_platform: DockerPlatform
146
149
  requires_workload_policy: bool = False
147
150
  gpu_config: Optional[GpuConfig] = None
151
+ parallel_containers: int = 1
148
152
 
149
153
  def __post_init__(self):
150
154
  if self.accelerator_type == AcceleratorType.GPU:
@@ -233,10 +237,12 @@ def get_tpu_system_characteristics_map(
233
237
  machine_type: str,
234
238
  supported_topologies: list[str],
235
239
  docker_platform: DockerPlatform,
240
+ supports_accelerator_network_profile: bool,
236
241
  tpu_type_requires_workload_policy: bool = False,
237
242
  default_topologies: set[str] | None = None,
238
243
  sub_slicing_topologies: set[str] | None = None,
239
244
  super_slicing_topologies: set[str] | None = None,
245
+ parallel_containers: int = 1,
240
246
  ) -> dict[str, SystemCharacteristics]:
241
247
  system_characteristics_map = {}
242
248
  default_topologies = default_topologies or set()
@@ -259,7 +265,9 @@ def get_tpu_system_characteristics_map(
259
265
  and vms_per_slice > 1,
260
266
  supports_sub_slicing=topology in sub_slicing_topologies,
261
267
  supports_super_slicing=topology in super_slicing_topologies,
268
+ supports_accelerator_network_profile=supports_accelerator_network_profile,
262
269
  docker_platform=docker_platform,
270
+ parallel_containers=parallel_containers,
263
271
  )
264
272
  system_characteristics_map[f'{prefix}-{topology}'] = system
265
273
  if (
@@ -303,6 +311,7 @@ UserFacingNameToSystemCharacteristics = {
303
311
  device_type='l4-1',
304
312
  supports_sub_slicing=False,
305
313
  supports_super_slicing=False,
314
+ supports_accelerator_network_profile=False,
306
315
  gpu_config=GpuConfig(requires_topology=False),
307
316
  docker_platform=AMD_PLATFORM,
308
317
  ),
@@ -316,6 +325,7 @@ UserFacingNameToSystemCharacteristics = {
316
325
  device_type='l4-2',
317
326
  supports_sub_slicing=False,
318
327
  supports_super_slicing=False,
328
+ supports_accelerator_network_profile=False,
319
329
  gpu_config=GpuConfig(requires_topology=False),
320
330
  docker_platform=AMD_PLATFORM,
321
331
  ),
@@ -329,6 +339,7 @@ UserFacingNameToSystemCharacteristics = {
329
339
  device_type='l4-4',
330
340
  supports_sub_slicing=False,
331
341
  supports_super_slicing=False,
342
+ supports_accelerator_network_profile=False,
332
343
  gpu_config=GpuConfig(requires_topology=False),
333
344
  docker_platform=AMD_PLATFORM,
334
345
  ),
@@ -342,6 +353,7 @@ UserFacingNameToSystemCharacteristics = {
342
353
  device_type='l4-8',
343
354
  supports_sub_slicing=False,
344
355
  supports_super_slicing=False,
356
+ supports_accelerator_network_profile=False,
345
357
  gpu_config=GpuConfig(requires_topology=False),
346
358
  docker_platform=AMD_PLATFORM,
347
359
  ),
@@ -356,6 +368,7 @@ UserFacingNameToSystemCharacteristics = {
356
368
  device_type='a100-40gb-1',
357
369
  supports_sub_slicing=False,
358
370
  supports_super_slicing=False,
371
+ supports_accelerator_network_profile=False,
359
372
  gpu_config=GpuConfig(requires_topology=False),
360
373
  docker_platform=AMD_PLATFORM,
361
374
  ),
@@ -369,6 +382,7 @@ UserFacingNameToSystemCharacteristics = {
369
382
  device_type='a100-40gb-2',
370
383
  supports_sub_slicing=False,
371
384
  supports_super_slicing=False,
385
+ supports_accelerator_network_profile=False,
372
386
  gpu_config=GpuConfig(requires_topology=False),
373
387
  docker_platform=AMD_PLATFORM,
374
388
  ),
@@ -382,6 +396,7 @@ UserFacingNameToSystemCharacteristics = {
382
396
  device_type='a100-40gb-4',
383
397
  supports_sub_slicing=False,
384
398
  supports_super_slicing=False,
399
+ supports_accelerator_network_profile=False,
385
400
  gpu_config=GpuConfig(requires_topology=False),
386
401
  docker_platform=AMD_PLATFORM,
387
402
  ),
@@ -395,6 +410,7 @@ UserFacingNameToSystemCharacteristics = {
395
410
  device_type='a100-40gb-8',
396
411
  supports_sub_slicing=False,
397
412
  supports_super_slicing=False,
413
+ supports_accelerator_network_profile=False,
398
414
  gpu_config=GpuConfig(requires_topology=False),
399
415
  docker_platform=AMD_PLATFORM,
400
416
  ),
@@ -408,6 +424,7 @@ UserFacingNameToSystemCharacteristics = {
408
424
  device_type='gb200-4',
409
425
  supports_sub_slicing=False,
410
426
  supports_super_slicing=False,
427
+ supports_accelerator_network_profile=True,
411
428
  gpu_config=GpuConfig(
412
429
  requires_topology=True,
413
430
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
@@ -426,6 +443,7 @@ UserFacingNameToSystemCharacteristics = {
426
443
  device_type='gb200-4',
427
444
  supports_sub_slicing=False,
428
445
  supports_super_slicing=False,
446
+ supports_accelerator_network_profile=True,
429
447
  gpu_config=GpuConfig(
430
448
  requires_topology=True,
431
449
  nccl_installer=INSTALLER_NCCL_RDMA_A4X,
@@ -444,6 +462,7 @@ UserFacingNameToSystemCharacteristics = {
444
462
  device_type='b200-8',
445
463
  supports_sub_slicing=False,
446
464
  supports_super_slicing=False,
465
+ supports_accelerator_network_profile=True,
447
466
  gpu_config=GpuConfig(
448
467
  requires_topology=True,
449
468
  nccl_installer=INSTALLER_NCCL_RDMA,
@@ -462,6 +481,7 @@ UserFacingNameToSystemCharacteristics = {
462
481
  device_type='h200-141gb-8',
463
482
  supports_sub_slicing=False,
464
483
  supports_super_slicing=False,
484
+ supports_accelerator_network_profile=True,
465
485
  gpu_config=GpuConfig(
466
486
  requires_topology=True,
467
487
  nccl_installer=INSTALLER_NCCL_RDMA,
@@ -481,6 +501,7 @@ UserFacingNameToSystemCharacteristics = {
481
501
  device_type='h100-80gb-8',
482
502
  supports_sub_slicing=False,
483
503
  supports_super_slicing=False,
504
+ supports_accelerator_network_profile=True,
484
505
  gpu_config=GpuConfig(
485
506
  requires_topology=True,
486
507
  nccl_installer=INSTALLER_NCCL_TCPX,
@@ -500,6 +521,7 @@ UserFacingNameToSystemCharacteristics = {
500
521
  device_type='h100-mega-80gb-8',
501
522
  supports_sub_slicing=False,
502
523
  supports_super_slicing=False,
524
+ supports_accelerator_network_profile=True,
503
525
  gpu_config=GpuConfig(
504
526
  requires_topology=True,
505
527
  nccl_installer=INSTALLER_NCCL_TCPXO,
@@ -516,6 +538,7 @@ UserFacingNameToSystemCharacteristics = {
516
538
  machine_type='tpu7x-standard-1t',
517
539
  supported_topologies=['1x1x1'],
518
540
  tpu_type_requires_workload_policy=True,
541
+ supports_accelerator_network_profile=False,
519
542
  docker_platform=AMD_PLATFORM,
520
543
  ),
521
544
  **get_tpu_system_characteristics_map(
@@ -524,7 +547,9 @@ UserFacingNameToSystemCharacteristics = {
524
547
  gke_accelerator='tpu7x',
525
548
  machine_type='tpu7x-standard-4t',
526
549
  tpu_type_requires_workload_policy=True,
550
+ supports_accelerator_network_profile=False,
527
551
  docker_platform=AMD_PLATFORM,
552
+ parallel_containers=2,
528
553
  supported_topologies=generate_tpu_topologies(max_cubes=144),
529
554
  super_slicing_topologies=set(['4x4x4']),
530
555
  default_topologies=set([
@@ -635,6 +660,7 @@ UserFacingNameToSystemCharacteristics = {
635
660
  machine_type='ct6e-standard-1t',
636
661
  supported_topologies=['1x1'],
637
662
  docker_platform=AMD_PLATFORM,
663
+ supports_accelerator_network_profile=True,
638
664
  ),
639
665
  **get_tpu_system_characteristics_map(
640
666
  prefix='v6e',
@@ -644,6 +670,7 @@ UserFacingNameToSystemCharacteristics = {
644
670
  supported_topologies=['2x2'] + SUB_SLICING_TOPOLOGIES,
645
671
  sub_slicing_topologies=set(SUB_SLICING_TOPOLOGIES),
646
672
  docker_platform=AMD_PLATFORM,
673
+ supports_accelerator_network_profile=True,
647
674
  ),
648
675
  **get_tpu_system_characteristics_map(
649
676
  prefix='v5p',
@@ -652,6 +679,7 @@ UserFacingNameToSystemCharacteristics = {
652
679
  machine_type='ct5p-hightpu-4t',
653
680
  docker_platform=AMD_PLATFORM,
654
681
  supported_topologies=generate_tpu_topologies(max_cubes=140),
682
+ supports_accelerator_network_profile=False,
655
683
  default_topologies=set([
656
684
  '2x2x1',
657
685
  '2x2x2',
@@ -758,6 +786,7 @@ UserFacingNameToSystemCharacteristics = {
758
786
  machine_type='ct5lp-hightpu-4t',
759
787
  docker_platform=AMD_PLATFORM,
760
788
  supported_topologies=['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'],
789
+ supports_accelerator_network_profile=False,
761
790
  ),
762
791
  **get_tpu_system_characteristics_map(
763
792
  prefix='v4',
@@ -768,6 +797,7 @@ UserFacingNameToSystemCharacteristics = {
768
797
  supported_topologies=generate_tpu_topologies(
769
798
  max_cubes=64, enforce_nondecreasing=False
770
799
  ),
800
+ supports_accelerator_network_profile=False,
771
801
  default_topologies=set([
772
802
  '2x2x1',
773
803
  '2x2x2',
@@ -796,6 +826,7 @@ UserFacingNameToSystemCharacteristics = {
796
826
  device_type='m1-megamem-96-1',
797
827
  supports_sub_slicing=False,
798
828
  supports_super_slicing=False,
829
+ supports_accelerator_network_profile=False,
799
830
  docker_platform=AMD_PLATFORM,
800
831
  ),
801
832
  # n2-standard-#vCPUs-#VMs
@@ -809,6 +840,7 @@ UserFacingNameToSystemCharacteristics = {
809
840
  device_type='n2-standard-64-1',
810
841
  supports_sub_slicing=False,
811
842
  supports_super_slicing=False,
843
+ supports_accelerator_network_profile=False,
812
844
  docker_platform=AMD_PLATFORM,
813
845
  ),
814
846
  'n2-standard-32-1': SystemCharacteristics(
@@ -821,6 +853,7 @@ UserFacingNameToSystemCharacteristics = {
821
853
  device_type='n2-standard-32-1',
822
854
  supports_sub_slicing=False,
823
855
  supports_super_slicing=False,
856
+ supports_accelerator_network_profile=False,
824
857
  docker_platform=AMD_PLATFORM,
825
858
  ),
826
859
  'n2-standard-32-2': SystemCharacteristics(
@@ -833,6 +866,7 @@ UserFacingNameToSystemCharacteristics = {
833
866
  device_type='n2-standard-32-2',
834
867
  supports_sub_slicing=False,
835
868
  supports_super_slicing=False,
869
+ supports_accelerator_network_profile=False,
836
870
  docker_platform=AMD_PLATFORM,
837
871
  ),
838
872
  'n2-standard-32-4': SystemCharacteristics(
@@ -845,6 +879,7 @@ UserFacingNameToSystemCharacteristics = {
845
879
  device_type='n2-standard-32-4',
846
880
  supports_sub_slicing=False,
847
881
  supports_super_slicing=False,
882
+ supports_accelerator_network_profile=False,
848
883
  docker_platform=AMD_PLATFORM,
849
884
  ),
850
885
  'n2-standard-32-8': SystemCharacteristics(
@@ -857,6 +892,7 @@ UserFacingNameToSystemCharacteristics = {
857
892
  device_type='n2-standard-32-8',
858
893
  supports_sub_slicing=False,
859
894
  supports_super_slicing=False,
895
+ supports_accelerator_network_profile=False,
860
896
  docker_platform=AMD_PLATFORM,
861
897
  ),
862
898
  'n2-standard-32-16': SystemCharacteristics(
@@ -869,6 +905,7 @@ UserFacingNameToSystemCharacteristics = {
869
905
  device_type='n2-standard-32-16',
870
906
  supports_sub_slicing=False,
871
907
  supports_super_slicing=False,
908
+ supports_accelerator_network_profile=False,
872
909
  docker_platform=AMD_PLATFORM,
873
910
  ),
874
911
  'n2-standard-32-32': SystemCharacteristics(
@@ -881,6 +918,7 @@ UserFacingNameToSystemCharacteristics = {
881
918
  device_type='n2-standard-32-32',
882
919
  supports_sub_slicing=False,
883
920
  supports_super_slicing=False,
921
+ supports_accelerator_network_profile=False,
884
922
  docker_platform=AMD_PLATFORM,
885
923
  ),
886
924
  'n2-standard-32-64': SystemCharacteristics(
@@ -893,6 +931,7 @@ UserFacingNameToSystemCharacteristics = {
893
931
  device_type='n2-standard-32-64',
894
932
  supports_sub_slicing=False,
895
933
  supports_super_slicing=False,
934
+ supports_accelerator_network_profile=False,
896
935
  docker_platform=AMD_PLATFORM,
897
936
  ),
898
937
  'n2-standard-32-128': SystemCharacteristics(
@@ -905,6 +944,7 @@ UserFacingNameToSystemCharacteristics = {
905
944
  device_type='n2-standard-32-128',
906
945
  supports_sub_slicing=False,
907
946
  supports_super_slicing=False,
947
+ supports_accelerator_network_profile=False,
908
948
  docker_platform=AMD_PLATFORM,
909
949
  ),
910
950
  'n2-standard-32-256': SystemCharacteristics(
@@ -917,6 +957,7 @@ UserFacingNameToSystemCharacteristics = {
917
957
  device_type='n2-standard-32-256',
918
958
  supports_sub_slicing=False,
919
959
  supports_super_slicing=False,
960
+ supports_accelerator_network_profile=False,
920
961
  docker_platform=AMD_PLATFORM,
921
962
  ),
922
963
  'n2-standard-32-512': SystemCharacteristics(
@@ -929,6 +970,7 @@ UserFacingNameToSystemCharacteristics = {
929
970
  device_type='n2-standard-32-512',
930
971
  supports_sub_slicing=False,
931
972
  supports_super_slicing=False,
973
+ supports_accelerator_network_profile=False,
932
974
  docker_platform=AMD_PLATFORM,
933
975
  ),
934
976
  'n2-standard-32-1024': SystemCharacteristics(
@@ -941,6 +983,7 @@ UserFacingNameToSystemCharacteristics = {
941
983
  device_type='n2-standard-32-1024',
942
984
  supports_sub_slicing=False,
943
985
  supports_super_slicing=False,
986
+ supports_accelerator_network_profile=False,
944
987
  docker_platform=AMD_PLATFORM,
945
988
  ),
946
989
  'n2-standard-32-2048': SystemCharacteristics(
@@ -953,6 +996,7 @@ UserFacingNameToSystemCharacteristics = {
953
996
  device_type='n2-standard-32-2048',
954
997
  supports_sub_slicing=False,
955
998
  supports_super_slicing=False,
999
+ supports_accelerator_network_profile=False,
956
1000
  docker_platform=AMD_PLATFORM,
957
1001
  ),
958
1002
  }
@@ -34,6 +34,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
34
34
  supported_topologies=["1x1"],
35
35
  docker_platform=DockerPlatform.AMD,
36
36
  tpu_type_requires_workload_policy=False,
37
+ supports_accelerator_network_profile=False,
37
38
  )
38
39
 
39
40
  expected_system_characteristics = SystemCharacteristics(
@@ -48,6 +49,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
48
49
  supports_super_slicing=False,
49
50
  docker_platform=DockerPlatform.AMD,
50
51
  requires_workload_policy=False,
52
+ supports_accelerator_network_profile=False,
51
53
  )
52
54
  assert result == {
53
55
  "test-1": expected_system_characteristics,
@@ -62,6 +64,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
62
64
  gke_accelerator="test",
63
65
  machine_type="test",
64
66
  supported_topologies=["2x2"],
67
+ supports_accelerator_network_profile=False,
65
68
  docker_platform=DockerPlatform.AMD,
66
69
  tpu_type_requires_workload_policy=True,
67
70
  )
@@ -76,6 +79,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
76
79
  device_type="test-8",
77
80
  supports_sub_slicing=False,
78
81
  supports_super_slicing=False,
82
+ supports_accelerator_network_profile=False,
79
83
  docker_platform=DockerPlatform.AMD,
80
84
  requires_workload_policy=False,
81
85
  )
@@ -94,6 +98,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
94
98
  supported_topologies=["2x2x2"],
95
99
  docker_platform=DockerPlatform.AMD,
96
100
  tpu_type_requires_workload_policy=True,
101
+ supports_accelerator_network_profile=False,
97
102
  )
98
103
 
99
104
  expected_system_characteristics = SystemCharacteristics(
@@ -106,6 +111,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_top
106
111
  device_type="test-16",
107
112
  supports_sub_slicing=False,
108
113
  supports_super_slicing=False,
114
+ supports_accelerator_network_profile=False,
109
115
  docker_platform=DockerPlatform.AMD,
110
116
  requires_workload_policy=True,
111
117
  )
@@ -122,6 +128,7 @@ def test_get_tpu_system_characteristics_map_sets_sub_slicing_support():
122
128
  gke_accelerator="test",
123
129
  machine_type="test",
124
130
  supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
131
+ supports_accelerator_network_profile=False,
125
132
  docker_platform=DockerPlatform.AMD,
126
133
  sub_slicing_topologies=set(["4x4x8", "4x4x16"]),
127
134
  )
@@ -138,6 +145,7 @@ def test_get_tpu_system_characteristics_map_sets_super_slicing_support():
138
145
  gke_accelerator="test",
139
146
  machine_type="test",
140
147
  supported_topologies=["4x4x4", "4x4x8", "4x4x16"],
148
+ supports_accelerator_network_profile=False,
141
149
  docker_platform=DockerPlatform.AMD,
142
150
  super_slicing_topologies=set(["4x4x8", "4x4x16"]),
143
151
  )
@@ -154,6 +162,7 @@ def test_get_tpu_system_characteristics_map_prefers_default_topologies():
154
162
  gke_accelerator="test",
155
163
  machine_type="test",
156
164
  supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
165
+ supports_accelerator_network_profile=False,
157
166
  docker_platform=DockerPlatform.AMD,
158
167
  default_topologies=set(["4x8x16"]),
159
168
  )
@@ -206,6 +215,7 @@ def test_system_characteristics_post_init_sets_workload_policy_for_gpu():
206
215
  device_type="l4-1",
207
216
  supports_sub_slicing=False,
208
217
  supports_super_slicing=False,
218
+ supports_accelerator_network_profile=False,
209
219
  docker_platform=DockerPlatform.AMD,
210
220
  gpu_config=GpuConfig(requires_topology=False),
211
221
  )
@@ -225,5 +235,6 @@ def test_system_characteristics_post_init_throws_for_gpu_without_config():
225
235
  device_type="l4-1",
226
236
  supports_sub_slicing=False,
227
237
  supports_super_slicing=False,
238
+ supports_accelerator_network_profile=False,
228
239
  docker_platform=DockerPlatform.AMD,
229
240
  )
xpk/core/telemetry.py CHANGED
@@ -30,7 +30,7 @@ from dataclasses import dataclass
30
30
  from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
31
31
  from ..utils.execution_context import is_dry_run
32
32
  from ..utils.user_agent import get_user_agent
33
- from ..utils.feature_flags import FeatureFlags
33
+ from ..utils.feature_flags import FeatureFlags, is_tester
34
34
 
35
35
 
36
36
  def should_send_telemetry():
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:
114
114
 
115
115
 
116
116
  class MetricsEventMetadataKey(Enum):
117
+ """Represents available metadata keys."""
118
+
117
119
  SESSION_ID = "XPK_SESSION_ID"
118
120
  DRY_RUN = "XPK_DRY_RUN"
119
121
  PYTHON_VERSION = "XPK_PYTHON_VERSION"
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
125
127
  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
126
128
  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
127
129
  LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
130
+ TESTER = "XPK_TESTER"
128
131
 
129
132
 
130
133
  @dataclass
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
230
233
  MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
231
234
  _is_running_from_source()
232
235
  ).lower(),
236
+ MetricsEventMetadataKey.TESTER: str(
237
+ is_tester() or _is_trash_execution()
238
+ ).lower(),
233
239
  }
234
240
 
235
241
 
@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
241
247
  }
242
248
 
243
249
 
250
def _is_trash_execution() -> bool:
  """Check the TELEMETRY_TRASH_EXECUTION environment variable.

  Returns:
    True only when the variable is set to exactly the string "true"; an unset
    variable or any other value (including different casing) gives False.
  """
  trash_flag = os.getenv("TELEMETRY_TRASH_EXECUTION")
  return trash_flag == "true"
252
+
253
+
244
254
  def _is_running_as_pip() -> bool:
245
255
  return os.path.basename(sys.argv[0]) == "xpk"
246
256
 
@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
30
30
  mocker.patch('time.time', side_effect=itertools.count())
31
31
  mocker.patch('platform.python_version', return_value='99.99.99')
32
32
  mocker.patch('os.path.basename', return_value='xpk.py')
33
+ mocker.patch('os.getenv', return_value='false')
33
34
  mocker.patch('os.path.abspath', return_value='/home/xpk_user')
35
+ mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
34
36
  set_dry_run(False)
35
37
  get_config().set(CLIENT_ID_KEY, 'client_id')
36
38
  yield
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
76
78
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
77
79
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
78
80
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
81
+ {'key': 'XPK_TESTER', 'value': 'false'},
79
82
  {'key': 'XPK_COMMAND', 'value': 'test'},
80
83
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
81
84
  ],
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
107
110
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
108
111
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
109
112
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
113
+ {'key': 'XPK_TESTER', 'value': 'false'},
110
114
  {'key': 'XPK_EXIT_CODE', 'value': '2'},
111
115
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
112
116
  ],
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
131
135
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
132
136
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
133
137
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
138
+ {'key': 'XPK_TESTER', 'value': 'false'},
134
139
  {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
135
140
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
136
141
  ],
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
219
224
  assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
220
225
 
221
226
 
227
@pytest.mark.parametrize(
    'tester,expected',
    [
        (True, 'true'),
        (False, 'false'),
    ],
)
def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
    tester: bool, expected: str, mocker: MockerFixture
):
  """Checks the XPK_TESTER metadata value when is_tester is patched.

  Args:
    tester: Value the patched `xpk.core.telemetry.is_tester` returns.
    expected: String expected under the XPK_TESTER key of the flushed payload.
    mocker: pytest-mock fixture used to install the patch.
  """
  mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
  MetricsCollector.log_start(command='test')
  logged_value = _get_metadata_value(MetricsCollector.flush(), 'XPK_TESTER')
  assert logged_value == expected
241
+
242
+
243
@pytest.mark.parametrize(
    argnames='trash_execution,expected',
    argvalues=[
        ('true', 'true'),
        ('false', 'false'),
        ('', 'false'),
        (None, 'false'),
    ],
)
def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
    trash_execution: str | None, expected: str, mocker: MockerFixture
):
  """Checks the XPK_TESTER metadata value for several os.getenv results.

  Args:
    trash_execution: Value the patched `os.getenv` returns. One parametrized
      case passes None, so the annotation is `str | None`.
    expected: String expected under the XPK_TESTER key of the flushed payload.
    mocker: pytest-mock fixture used to install the patch.
  """
  # The patch replaces os.getenv for every variable name, not only the one
  # under test, so any getenv call made while logging sees this value.
  mocker.patch('os.getenv', return_value=trash_execution)
  MetricsCollector.log_start(command='test')
  payload = MetricsCollector.flush()
  assert _get_metadata_value(payload, 'XPK_TESTER') == expected
259
+
260
+
222
261
  def _get_metadata_value(payload_str: str, key: str) -> str | None:
223
262
  payload = json.loads(payload_str)
224
263
  metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
@@ -17,6 +17,8 @@ limitations under the License.
17
17
  import re
18
18
  from pytest_mock import MockerFixture
19
19
 
20
+ from ..commands import FailedCommand
21
+
20
22
 
21
23
  class CommandsTester:
22
24
  """Tester class useful for mocking and asserting command runs."""
@@ -27,6 +29,7 @@ class CommandsTester:
27
29
  run_command_for_value_path: str | None = None,
28
30
  run_command_with_updates_path: str | None = None,
29
31
  run_command_with_updates_retry_path: str | None = None,
32
+ run_command_batch_path: str | None = None,
30
33
  ):
31
34
  self.__results: dict[re.Pattern, tuple[int, str]] = {}
32
35
  self.commands_history: list[str] = []
@@ -45,6 +48,11 @@ class CommandsTester:
45
48
  run_command_with_updates_retry_path,
46
49
  wraps=self.__fake_run_command_with_updates_retry,
47
50
  )
51
+ if run_command_batch_path:
52
+ mocker.patch(
53
+ run_command_batch_path,
54
+ wraps=self.__fake_run_command_batch,
55
+ )
48
56
 
49
57
  def set_result_for_command(
50
58
  self, result: tuple[int, str], *command_parts: str
@@ -111,6 +119,24 @@ class CommandsTester:
111
119
  ) -> tuple[int, str]:
112
120
  return self.__common_fake_run_command(command, (0, dry_run_return_val))
113
121
 
122
+ def __fake_run_command_batch(
123
+ self,
124
+ commands: list[str],
125
+ jobname: str,
126
+ per_command_name: list[str],
127
+ output_logs: list[str],
128
+ ) -> FailedCommand | None:
129
+ for i, command in enumerate(commands):
130
+ result = self.__common_fake_run_command(command, (0, ""))[0]
131
+ if result != 0:
132
+ return FailedCommand(
133
+ return_code=result,
134
+ name=per_command_name[i],
135
+ command=command,
136
+ logfile=output_logs[i],
137
+ )
138
+ return None
139
+
114
140
  # pylint: enable=unused-argument
115
141
 
116
142
  def __common_fake_run_command(
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  import pytest
18
18
  from pytest_mock import MockerFixture
19
19
 
20
- from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
20
+ from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
21
21
  from xpk.core.testing.commands_tester import CommandsTester
22
22
 
23
23
 
@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
31
31
  run_command_with_updates_retry_path=(
32
32
  "xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
33
33
  ),
34
+ run_command_batch_path=(
35
+ "xpk.core.testing.commands_tester_test.run_command_batch"
36
+ ),
34
37
  )
35
38
 
36
39
 
@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
54
57
  mock_commands.assert_command_run("cmd", "bar")
55
58
 
56
59
 
60
def test_run_command_batch_default_result(
    mock_commands: CommandsTester,
):
  """Runs a two-command batch with no result configured for either command.

  Expects `run_command_batch` to return None and both commands to be recorded
  by the tester.
  """
  batch_commands = ["cmd1 foo bar", "cmd2 foo bar"]

  failure = run_command_batch(
      commands=batch_commands,
      jobname="Test command",
      per_command_name=["cmd1", "cmd2"],
      output_logs=["log1", "log2"],
  )

  assert failure is None
  mock_commands.assert_command_run("foo bar", times=2)
  for command_prefix in ("cmd1", "cmd2"):
    mock_commands.assert_command_run(command_prefix)
74
+
75
+
57
76
  def test_set_result_for_command(mock_commands: CommandsTester):
58
77
  mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")
59
78
 
@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
84
84
  volumes.append(
85
85
  {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
86
86
  )
87
+ volumes.append({
88
+ 'name': 'dshm',
89
+ 'emptyDir': {
90
+ 'medium': 'Memory',
91
+ },
92
+ })
87
93
 
88
94
 
89
95
  def add_tolerations(job_manifest):
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
111
117
  container['volumeMounts'].append(
112
118
  {'name': 'gib', 'mountPath': '/usr/local/gib'}
113
119
  )
120
+ container['volumeMounts'].append(
121
+ {'name': 'dshm', 'mountPath': '/dev/shm'}
122
+ )
xpk/parser/cluster.py CHANGED
@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
338
338
  add_resource_limits(cluster_create_resource_limits)
339
339
 
340
340
  cluster_create_ray_parser.set_defaults(
341
- func=cluster_create_ray_cluster, sub_slicing=False, super_slicing=False
341
+ func=cluster_create_ray_cluster,
342
+ sub_slicing=False,
343
+ super_slicing=False,
344
+ num_cubes=None,
342
345
  )
343
346
 
344
347
 
@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
503
506
  )
504
507
  add_driver_arguments(cluster_adapt_optional_arguments)
505
508
  add_shared_arguments(cluster_adapt_optional_arguments)
509
+ add_resource_limits(cluster_adapt_optional_arguments)
510
+
511
+ if FeatureFlags.SUB_SLICING_ENABLED:
512
+ add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
513
+
514
+ if FeatureFlags.SUPER_SLICING_ENABLED:
515
+ add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)
506
516
 
507
517
  cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
508
518
  'Capacity Arguments', 'Arguments related to capacity for cluster create.'